OCaml HTML5 parser/serialiser based on Python's JustHTML

types

+7 -6
lib/htmlrw_check/checker.ml
··· 8 8 9 9 val start_element : 10 10 state -> 11 - name:string -> 12 - namespace:string option -> 13 - attrs:(string * string) list -> 11 + element:Element.t -> 14 12 Message_collector.t -> 15 13 unit 16 14 17 15 val end_element : 18 - state -> name:string -> namespace:string option -> Message_collector.t -> unit 16 + state -> 17 + tag:Tag.element_tag -> 18 + Message_collector.t -> 19 + unit 19 20 20 21 val characters : state -> string -> Message_collector.t -> unit 21 22 val end_document : state -> Message_collector.t -> unit ··· 30 31 let create () = () 31 32 let reset () = () 32 33 33 - let start_element () ~name:_ ~namespace:_ ~attrs:_ _ = () 34 - let end_element () ~name:_ ~namespace:_ _ = () 34 + let start_element () ~element:_ _ = () 35 + let end_element () ~tag:_ _ = () 35 36 let characters () _ _ = () 36 37 let end_document () _ = () 37 38 end
+9 -12
lib/htmlrw_check/checker.mli
··· 87 87 88 88 val start_element : 89 89 state -> 90 - name:string -> 91 - namespace:string option -> 92 - attrs:(string * string) list -> 90 + element:Element.t -> 93 91 Message_collector.t -> 94 92 unit 95 - (** [start_element state ~name ~namespace ~attrs collector] is called when 93 + (** [start_element state ~element collector] is called when 96 94 entering an element during DOM traversal. 97 95 98 96 @param state The checker state 99 - @param name The element tag name (e.g., "div", "p", "span") 100 - @param namespace The element namespace ([None] for HTML, [Some "svg"] 101 - for SVG, [Some "mathml"] for MathML) 102 - @param attrs The element's attributes as [(name, value)] pairs 97 + @param element The typed element (includes tag, typed attrs, and raw attrs) 103 98 @param collector The message collector for emitting validation messages 104 99 105 100 This is where checkers can validate: ··· 109 104 - Whether the element opens a new validation context *) 110 105 111 106 val end_element : 112 - state -> name:string -> namespace:string option -> Message_collector.t -> unit 113 - (** [end_element state ~name ~namespace collector] is called when exiting 107 + state -> 108 + tag:Tag.element_tag -> 109 + Message_collector.t -> 110 + unit 111 + (** [end_element state ~tag collector] is called when exiting 114 112 an element during DOM traversal. 115 113 116 114 @param state The checker state 117 - @param name The element tag name 118 - @param namespace The element namespace 115 + @param tag The element tag 119 116 @param collector The message collector for emitting validation messages 120 117 121 118 This is where checkers can:
+46 -16
lib/htmlrw_check/content_model/content_checker.ml
··· 2 2 name : string; 3 3 spec : Element_spec.t; 4 4 children_count : int; 5 + is_foreign : bool; (* SVG or MathML element *) 5 6 } 6 7 7 8 type state = { ··· 92 93 Message_collector.add_typed collector 93 94 (`Element (`Not_allowed_as_child (`Child child_name, `Parent parent.name))) 94 95 95 - let start_element state ~name ~namespace:_ ~attrs:_ collector = 96 - (* Look up element specification *) 97 - let spec_opt = Element_registry.get state.registry name in 96 + let start_element state ~element collector = 97 + let name = Tag.tag_to_string element.Element.tag in 98 + 99 + (* Check if we're inside a foreign (SVG/MathML) context *) 100 + let in_foreign_context = match state.ancestor_stack with 101 + | ctx :: _ -> ctx.is_foreign 102 + | [] -> false 103 + in 104 + 105 + (* Determine if this element is foreign content *) 106 + let is_foreign = match element.Element.tag with 107 + | Tag.Svg _ | Tag.MathML _ -> true 108 + | _ -> in_foreign_context (* Inherit from parent if inside foreign content *) 109 + in 110 + 111 + (* If entering foreign content from HTML, SVG/MathML are valid embedded content *) 112 + (* If already in foreign content, skip HTML content model checks *) 113 + if is_foreign && not in_foreign_context then begin 114 + (* Entering SVG/MathML from HTML - just track it, it's valid embedded content *) 115 + let spec = Element_spec.make ~name ~content_model:(Content_model.Categories [Content_category.Flow]) () in 116 + let context = { name; spec; children_count = 0; is_foreign = true } in 117 + state.ancestor_stack <- context :: state.ancestor_stack 118 + end else if is_foreign then begin 119 + (* Inside SVG/MathML - just track nesting, don't validate against HTML *) 120 + let spec = Element_spec.make ~name ~content_model:(Content_model.Categories [Content_category.Flow]) () in 121 + let context = { name; spec; children_count = 0; is_foreign = true } in 122 + state.ancestor_stack <- context :: state.ancestor_stack 123 + end else begin 124 + (* HTML element - do normal validation *) 125 + let spec_opt = Element_registry.get state.registry name in 98 126 99 - match spec_opt with 100 - | None -> 101 - (* Unknown element - first check if it's allowed in current context *) 102 - validate_child_element state name collector 103 - | Some spec -> 104 - (* Check prohibited ancestors *) 105 - check_prohibited_ancestors state name spec collector; 127 + match spec_opt with 128 + | None -> 129 + (* Unknown element - first check if it's allowed in current context *) 130 + validate_child_element state name collector 131 + | Some spec -> 132 + (* Check prohibited ancestors *) 133 + check_prohibited_ancestors state name spec collector; 106 134 107 - (* Validate this element is allowed as child of parent *) 108 - validate_child_element state name collector; 135 + (* Validate this element is allowed as child of parent *) 136 + validate_child_element state name collector; 109 137 110 - (* Push element context onto stack *) 111 - let context = { name; spec; children_count = 0 } in 112 - state.ancestor_stack <- context :: state.ancestor_stack 138 + (* Push element context onto stack *) 139 + let context = { name; spec; children_count = 0; is_foreign = false } in 140 + state.ancestor_stack <- context :: state.ancestor_stack 141 + end 113 142 114 - let end_element state ~name ~namespace:_ collector = 143 + let end_element state ~tag collector = 144 + let name = Tag.tag_to_string tag in 115 145 match state.ancestor_stack with 116 146 | [] -> 117 147 (* Unmatched closing tag *)
+14 -22
lib/htmlrw_check/dom_walker.ml
··· 15 15 16 16 (** Package a checker with its state for traversal. *) 17 17 type checker_state = { 18 - start_element : 19 - name:string -> 20 - namespace:string option -> 21 - attrs:(string * string) list -> 22 - Message_collector.t -> 23 - unit; 24 - end_element : 25 - name:string -> namespace:string option -> Message_collector.t -> unit; 18 + start_element : element:Element.t -> Message_collector.t -> unit; 19 + end_element : tag:Tag.element_tag -> Message_collector.t -> unit; 26 20 characters : string -> Message_collector.t -> unit; 27 21 end_document : Message_collector.t -> unit; 28 22 } ··· 31 25 let make_checker_state (module C : Checker.S) = 32 26 let state = C.create () in 33 27 { 34 - start_element = (fun ~name ~namespace ~attrs collector -> 35 - C.start_element state ~name ~namespace ~attrs collector); 36 - end_element = (fun ~name ~namespace collector -> 37 - C.end_element state ~name ~namespace collector); 28 + start_element = (fun ~element collector -> 29 + C.start_element state ~element collector); 30 + end_element = (fun ~tag collector -> 31 + C.end_element state ~tag collector); 38 32 characters = (fun text collector -> 39 33 C.characters state text collector); 40 34 end_document = (fun collector -> ··· 60 54 (* Doctype node: skip (no validation events for doctype) *) 61 55 () 62 56 | _ -> 63 - (* Element node: emit start, traverse children, emit end *) 64 - cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector; 57 + (* Element node: create typed element, emit start, traverse children, emit end *) 58 + let element = Element.create ~name:node.name ~namespace:node.namespace ~attrs:node.attrs in 59 + cs.start_element ~element collector; 65 60 List.iter (walk_node_single cs collector) node.children; 66 - cs.end_element ~name:node.name ~namespace:node.namespace collector 61 + cs.end_element ~tag:element.tag collector 67 62 68 63 let walk checker collector node = 69 64 let cs = make_checker_state checker in ··· 89 84 (* Doctype node: skip *) 90 85 () 91 86 | _ -> 92 - (* Element node: emit start to all checkers, traverse children, emit end to all *) 93 - List.iter (fun cs -> 94 - cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector 95 - ) css; 87 + (* Element node: create typed element, emit start to all checkers, traverse children, emit end to all *) 88 + let element = Element.create ~name:node.name ~namespace:node.namespace ~attrs:node.attrs in 89 + List.iter (fun cs -> cs.start_element ~element collector) css; 96 90 List.iter (walk_node_all css collector) node.children; 97 - List.iter (fun cs -> 98 - cs.end_element ~name:node.name ~namespace:node.namespace collector 99 - ) css 91 + List.iter (fun cs -> cs.end_element ~tag:element.tag collector) css 100 92 101 93 let walk_all checkers collector node = 102 94 (* Create checker state packages *)
+873
lib/htmlrw_check/element/attr.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Typed HTML5 attribute representations using polymorphic variants. 7 + 8 + This module provides typed representations for HTML attributes with 9 + proper value types for enumerated attributes. *) 10 + 11 + (** {1 Attribute Value Types} *) 12 + 13 + (** Direction attribute values *) 14 + type dir_value = [ `Ltr | `Rtl | `Auto ] 15 + 16 + (** Hidden attribute values *) 17 + type hidden_value = [ `Hidden | `Until_found ] 18 + 19 + (** Popover attribute values *) 20 + type popover_value = [ `Auto | `Manual | `Hint ] 21 + 22 + (** Link target values *) 23 + type target_value = [ `Self | `Blank | `Parent | `Top | `Named of string ] 24 + 25 + (** Loading behavior values *) 26 + type loading_value = [ `Eager | `Lazy ] 27 + 28 + (** Decoding hint values *) 29 + type decoding_value = [ `Sync | `Async | `Auto ] 30 + 31 + (** Fetch priority values *) 32 + type fetchpriority_value = [ `High | `Low | `Auto ] 33 + 34 + (** CORS settings values *) 35 + type crossorigin_value = [ `Anonymous | `Use_credentials ] 36 + 37 + (** Preload hint values *) 38 + type preload_value = [ `None | `Metadata | `Auto ] 39 + 40 + (** Form method values *) 41 + type method_value = [ `Get | `Post | `Dialog ] 42 + 43 + (** Form enctype values *) 44 + type enctype_value = [ `Urlencoded | `Multipart | `Plain ] 45 + 46 + (** Textarea wrap values *) 47 + type wrap_value = [ `Soft | `Hard ] 48 + 49 + (** Table scope values *) 50 + type scope_value = [ `Row | `Col | `Rowgroup | `Colgroup ] 51 + 52 + (** Input type values *) 53 + type input_type = [ 54 + | `Hidden | `Text | `Search | `Tel | `Url | `Email | `Password 55 + | `Date | `Month | `Week | `Time | `Datetime_local | `Number 56 + | `Range | `Color | `Checkbox | `Radio | `File | `Submit 57 + | `Image | `Reset | `Button 58 + ] 59 + 60 + (** Button type values *) 61 + type button_type = [ `Submit | `Reset | `Button ] 62 + 63 + (** Referrer policy values *) 64 + type referrerpolicy_value = [ 65 + | `No_referrer | `No_referrer_when_downgrade | `Origin 66 + | `Origin_when_cross_origin | `Same_origin | `Strict_origin 67 + | `Strict_origin_when_cross_origin | `Unsafe_url 68 + ] 69 + 70 + (** Sandbox flag values *) 71 + type sandbox_flag = [ 72 + | `Allow_downloads | `Allow_forms | `Allow_modals | `Allow_orientation_lock 73 + | `Allow_pointer_lock | `Allow_popups | `Allow_popups_to_escape_sandbox 74 + | `Allow_presentation | `Allow_same_origin | `Allow_scripts 75 + | `Allow_top_navigation | `Allow_top_navigation_by_user_activation 76 + | `Allow_top_navigation_to_custom_protocols 77 + ] 78 + 79 + (** Enter key hint values *) 80 + type enterkeyhint_value = [ 81 + | `Enter | `Done | `Go | `Next | `Previous | `Search | `Send 82 + ] 83 + 84 + (** Input mode values *) 85 + type inputmode_value = [ 86 + | `None | `Text | `Decimal | `Numeric | `Tel | `Search | `Email | `Url 87 + ] 88 + 89 + (** Content editable values *) 90 + type contenteditable_value = [ `True | `False | `Plaintext_only ] 91 + 92 + (** Autocapitalize values *) 93 + type autocapitalize_value = [ 94 + | `Off | `None | `On | `Sentences | `Words | `Characters 95 + ] 96 + 97 + (** Image shape values *) 98 + type shape_value = [ `Rect | `Circle | `Poly | `Default ] 99 + 100 + (** Capture values *) 101 + type capture_value = [ `User | `Environment ] 102 + 103 + (** List type values *) 104 + type list_type_value = [ 105 + | `Decimal | `Lower_alpha | `Upper_alpha | `Lower_roman | `Upper_roman 106 + ] 107 + 108 + (** Track kind values *) 109 + type kind_value = [ 110 + | `Subtitles | `Captions | `Descriptions | `Chapters | `Metadata 111 + ] 112 + 113 + (** {1 Typed Attribute Variant} *) 114 + 115 + (** Typed attribute representation *) 116 + type t = [ 117 + (* Global attributes *) 118 + | `Id of string 119 + | `Class of string 120 + | `Style of string 121 + | `Title of string 122 + | `Lang of string 123 + | `Dir of dir_value 124 + | `Hidden of hidden_value option (* None = just "hidden" *) 125 + | `Tabindex of int 126 + | `Accesskey of string 127 + | `Autocapitalize of autocapitalize_value 128 + | `Autofocus 129 + | `Contenteditable of contenteditable_value option 130 + | `Draggable of bool 131 + | `Enterkeyhint of enterkeyhint_value 132 + | `Inert 133 + | `Inputmode of inputmode_value 134 + | `Is of string 135 + | `Nonce of string 136 + | `Popover of popover_value option 137 + | `Slot of string 138 + | `Spellcheck of bool option 139 + | `Translate of bool 140 + | `Exportparts of string 141 + | `Part of string 142 + 143 + (* Microdata *) 144 + | `Itemscope 145 + | `Itemtype of string 146 + | `Itemprop of string 147 + | `Itemid of string 148 + | `Itemref of string 149 + 150 + (* ARIA *) 151 + | `Role of string 152 + | `Aria of string * string (* aria-* -> (name, value) *) 153 + 154 + (* Event handlers *) 155 + | `Event of string * string (* onclick -> ("click", handler) *) 156 + 157 + (* Link/navigation attributes *) 158 + | `Href of string 159 + | `Target of target_value 160 + | `Rel of string 161 + | `Download of string option 162 + | `Hreflang of string 163 + | `Ping of string 164 + | `Referrerpolicy of referrerpolicy_value 165 + | `Type_link of string 166 + 167 + (* Media/resource attributes *) 168 + | `Src of string 169 + | `Srcset of string 170 + | `Sizes of string 171 + | `Alt of string 172 + | `Width of string 173 + | `Height of string 174 + | `Loading of loading_value 175 + | `Decoding of decoding_value 176 + | `Fetchpriority of fetchpriority_value 177 + | `Crossorigin of crossorigin_value option 178 + | `Ismap 179 + | `Usemap of string 180 + | `Media of string 181 + 182 + (* Audio/Video specific *) 183 + | `Controls 184 + | `Autoplay 185 + | `Loop 186 + | `Muted 187 + | `Preload of preload_value 188 + | `Poster of string 189 + | `Playsinline 190 + 191 + (* Image map *) 192 + | `Coords of string 193 + | `Shape of shape_value 194 + 195 + (* iframe *) 196 + | `Sandbox of sandbox_flag list option 197 + | `Allow of string 198 + | `Allowfullscreen 199 + | `Srcdoc of string 200 + | `Csp of string 201 + 202 + (* Form attributes *) 203 + | `Action of string 204 + | `Method of method_value 205 + | `Enctype of enctype_value 206 + | `Novalidate 207 + | `Accept_charset of string 208 + | `Autocomplete of string 209 + | `Name of string 210 + | `Form of string 211 + 212 + (* Form control attributes *) 213 + | `Value of string 214 + | `Type_input of input_type 215 + | `Type_button of button_type 216 + | `Disabled 217 + | `Readonly 218 + | `Required 219 + | `Checked 220 + | `Selected 221 + | `Multiple 222 + | `Placeholder of string 223 + | `Min of string 224 + | `Max of string 225 + | `Step of string 226 + | `Minlength of int 227 + | `Maxlength of int 228 + | `Pattern of string 229 + | `Size of int 230 + | `Cols of int 231 + | `Rows of int 232 + | `Wrap of wrap_value 233 + | `Accept of string 234 + | `Capture of capture_value 235 + | `Dirname of string 236 + | `For of string 237 + | `List of string 238 + 239 + (* Form submission attributes *) 240 + | `Formaction of string 241 + | `Formmethod of method_value 242 + | `Formenctype of enctype_value 243 + | `Formnovalidate 244 + | `Formtarget of target_value 245 + 246 + (* Table attributes *) 247 + | `Colspan of int 248 + | `Rowspan of int 249 + | `Headers of string 250 + | `Scope of scope_value 251 + | `Span of int 252 + 253 + (* Details/Dialog *) 254 + | `Open 255 + 256 + (* Script *) 257 + | `Async 258 + | `Defer 259 + | `Integrity of string 260 + | `Nomodule 261 + | `Blocking of string 262 + | `Type_script of string 263 + 264 + (* Meta *) 265 + | `Charset of string 266 + | `Content of string 267 + | `Http_equiv of string 268 + 269 + (* Link element *) 270 + | `As of string 271 + | `Imagesizes of string 272 + | `Imagesrcset of string 273 + 274 + (* Object/Embed *) 275 + | `Data_object of string 276 + 277 + (* Output *) 278 + | `For_output of string 279 + 280 + (* Meter/Progress *) 281 + | `Low of float 282 + | `High of float 283 + | `Optimum of float 284 + 285 + (* Time *) 286 + | `Datetime of string 287 + 288 + (* Ol *) 289 + | `Start of int 290 + | `Reversed 291 + | `Type_list of list_type_value 292 + 293 + (* Track *) 294 + | `Kind of kind_value 295 + | `Srclang of string 296 + | `Default 297 + 298 + (* Td/Th *) 299 + | `Abbr of string 300 + 301 + (* Data attributes *) 302 + | `Data_attr of string * string 303 + 304 + (* RDFa *) 305 + | `Property of string 306 + | `Typeof of string 307 + | `Resource of string 308 + | `Prefix of string 309 + | `Vocab of string 310 + | `About of string 311 + | `Datatype of string 312 + | `Inlist 313 + | `Rev of string 314 + 315 + (* Escape hatch *) 316 + | `Unknown_attr of string * string 317 + ] 318 + 319 + (** {1 Parsing Functions} *) 320 + 321 + (** Parse dir value *) 322 + let parse_dir = function 323 + | "ltr" -> Some `Ltr 324 + | "rtl" -> Some `Rtl 325 + | "auto" -> Some `Auto 326 + | _ -> None 327 + 328 + (** Parse target value *) 329 + let parse_target = function 330 + | "_self" -> `Self 331 + | "_blank" -> `Blank 332 + | "_parent" -> `Parent 333 + | "_top" -> `Top 334 + | s -> `Named s 335 + 336 + (** Parse loading value *) 337 + let parse_loading = function 338 + | "eager" -> Some `Eager 339 + | "lazy" -> Some `Lazy 340 + | _ -> None 341 + 342 + (** Parse decoding value *) 343 + let parse_decoding = function 344 + | "sync" -> Some `Sync 345 + | "async" -> Some `Async 346 + | "auto" -> Some `Auto 347 + | _ -> None 348 + 349 + (** Parse fetchpriority value *) 350 + let parse_fetchpriority = function 351 + | "high" -> Some `High 352 + | "low" -> Some `Low 353 + | "auto" -> Some `Auto 354 + | _ -> None 355 + 356 + (** Parse crossorigin value *) 357 + let parse_crossorigin = function 358 + | "anonymous" | "" -> Some `Anonymous 359 + | "use-credentials" -> Some `Use_credentials 360 + | _ -> None 361 + 362 + (** Parse preload value *) 363 + let parse_preload = function 364 + | "none" -> Some `None 365 + | "metadata" -> Some `Metadata 366 + | "auto" | "" -> Some `Auto 367 + | _ -> None 368 + 369 + (** Parse method value *) 370 + let parse_method = function 371 + | "get" -> Some `Get 372 + | "post" -> Some `Post 373 + | "dialog" -> Some `Dialog 374 + | _ -> None 375 + 376 + (** Parse enctype value *) 377 + let parse_enctype = function 378 + | "application/x-www-form-urlencoded" -> Some `Urlencoded 379 + | "multipart/form-data" -> Some `Multipart 380 + | "text/plain" -> Some `Plain 381 + | _ -> None 382 + 383 + (** Parse wrap value *) 384 + let parse_wrap = function 385 + | "soft" -> Some `Soft 386 + | "hard" -> Some `Hard 387 + | _ -> None 388 + 389 + (** Parse scope value *) 390 + let parse_scope = function 391 + | "row" -> Some `Row 392 + | "col" -> Some `Col 393 + | "rowgroup" -> Some `Rowgroup 394 + | "colgroup" -> Some `Colgroup 395 + | _ -> None 396 + 397 + (** Parse input type value *) 398 + let parse_input_type = function 399 + | "hidden" -> Some `Hidden 400 + | "text" -> Some `Text 401 + | "search" -> Some `Search 402 + | "tel" -> Some `Tel 403 + | "url" -> Some `Url 404 + | "email" -> Some `Email 405 + | "password" -> Some `Password 406 + | "date" -> Some `Date 407 + | "month" -> Some `Month 408 + | "week" -> Some `Week 409 + | "time" -> Some `Time 410 + | "datetime-local" -> Some `Datetime_local 411 + | "number" -> Some `Number 412 + | "range" -> Some `Range 413 + | "color" -> Some `Color 414 + | "checkbox" -> Some `Checkbox 415 + | "radio" -> Some `Radio 416 + | "file" -> Some `File 417 + | "submit" -> Some `Submit 418 + | "image" -> Some `Image 419 + | "reset" -> Some `Reset 420 + | "button" -> Some `Button 421 + | _ -> None 422 + 423 + (** Parse button type value *) 424 + let parse_button_type = function 425 + | "submit" -> Some `Submit 426 + | "reset" -> Some `Reset 427 + | "button" -> Some `Button 428 + | _ -> None 429 + 430 + (** Parse shape value *) 431 + let parse_shape = function 432 + | "rect" -> Some `Rect 433 + | "circle" -> Some `Circle 434 + | "poly" -> Some `Poly 435 + | "default" -> Some `Default 436 + | _ -> None 437 + 438 + (** Parse capture value *) 439 + let parse_capture = function 440 + | "user" -> Some `User 441 + | "environment" -> Some `Environment 442 + | _ -> None 443 + 444 + (** Parse list type value *) 445 + let parse_list_type = function 446 + | "1" -> Some `Decimal 447 + | "a" -> Some `Lower_alpha 448 + | "A" -> Some `Upper_alpha 449 + | "i" -> Some `Lower_roman 450 + | "I" -> Some `Upper_roman 451 + | _ -> None 452 + 453 + (** Parse kind value *) 454 + let parse_kind = function 455 + | "subtitles" -> Some `Subtitles 456 + | "captions" -> Some `Captions 457 + | "descriptions" -> Some `Descriptions 458 + | "chapters" -> Some `Chapters 459 + | "metadata" -> Some `Metadata 460 + | _ -> None 461 + 462 + (** Parse referrerpolicy value *) 463 + let parse_referrerpolicy = function 464 + | "no-referrer" -> Some `No_referrer 465 + | "no-referrer-when-downgrade" -> Some `No_referrer_when_downgrade 466 + | "origin" -> Some `Origin 467 + | "origin-when-cross-origin" -> Some `Origin_when_cross_origin 468 + | "same-origin" -> Some `Same_origin 469 + | "strict-origin" -> Some `Strict_origin 470 + | "strict-origin-when-cross-origin" -> Some `Strict_origin_when_cross_origin 471 + | "unsafe-url" -> Some `Unsafe_url 472 + | _ -> None 473 + 474 + (** Parse sandbox flag *) 475 + let parse_sandbox_flag = function 476 + | "allow-downloads" -> Some `Allow_downloads 477 + | "allow-forms" -> Some `Allow_forms 478 + | "allow-modals" -> Some `Allow_modals 479 + | "allow-orientation-lock" -> Some `Allow_orientation_lock 480 + | "allow-pointer-lock" -> Some `Allow_pointer_lock 481 + | "allow-popups" -> Some `Allow_popups 482 + | "allow-popups-to-escape-sandbox" -> Some `Allow_popups_to_escape_sandbox 483 + | "allow-presentation" -> Some `Allow_presentation 484 + | "allow-same-origin" -> Some `Allow_same_origin 485 + | "allow-scripts" -> Some `Allow_scripts 486 + | "allow-top-navigation" -> Some `Allow_top_navigation 487 + | "allow-top-navigation-by-user-activation" -> Some `Allow_top_navigation_by_user_activation 488 + | "allow-top-navigation-to-custom-protocols" -> Some `Allow_top_navigation_to_custom_protocols 489 + | _ -> None 490 + 491 + (** Parse sandbox value (space-separated flags) *) 492 + let parse_sandbox value = 493 + if String.trim value = "" then 494 + Some [] 495 + else 496 + let flags = String.split_on_char ' ' value |> List.filter (fun s -> s <> "") in 497 + let parsed = List.filter_map parse_sandbox_flag flags in 498 + if List.length parsed = List.length flags then 499 + Some parsed 500 + else 501 + None 502 + 503 + (** Parse enterkeyhint value *) 504 + let parse_enterkeyhint = function 505 + | "enter" -> Some `Enter 506 + | "done" -> Some `Done 507 + | "go" -> Some `Go 508 + | "next" -> Some `Next 509 + | "previous" -> Some `Previous 510 + | "search" -> Some `Search 511 + | "send" -> Some `Send 512 + | _ -> None 513 + 514 + (** Parse inputmode value *) 515 + let parse_inputmode = function 516 + | "none" -> Some `None 517 + | "text" -> Some `Text 518 + | "decimal" -> Some `Decimal 519 + | "numeric" -> Some `Numeric 520 + | "tel" -> Some `Tel 521 + | "search" -> Some `Search 522 + | "email" -> Some `Email 523 + | "url" -> Some `Url 524 + | _ -> None 525 + 526 + (** Parse contenteditable value *) 527 + let parse_contenteditable = function 528 + | "true" | "" -> Some `True 529 + | "false" -> Some `False 530 + | "plaintext-only" -> Some `Plaintext_only 531 + | _ -> None 532 + 533 + (** Parse autocapitalize value *) 534 + let parse_autocapitalize = function 535 + | "off" -> Some `Off 536 + | "none" -> Some `None 537 + | "on" -> Some `On 538 + | "sentences" -> Some `Sentences 539 + | "words" -> Some `Words 540 + | "characters" -> Some `Characters 541 + | _ -> None 542 + 543 + (** Parse hidden value *) 544 + let parse_hidden = function 545 + | "" | "hidden" -> Some `Hidden 546 + | "until-found" -> Some `Until_found 547 + | _ -> None 548 + 549 + (** Parse popover value *) 550 + let parse_popover = function 551 + | "" | "auto" -> Some `Auto 552 + | "manual" -> Some `Manual 553 + | "hint" -> Some `Hint 554 + | _ -> None 555 + 556 + (** Try to parse an integer *) 557 + let parse_int s = 558 + try Some (int_of_string (String.trim s)) 559 + with Failure _ -> None 560 + 561 + (** Try to parse a float *) 562 + let parse_float s = 563 + try Some (float_of_string (String.trim s)) 564 + with Failure _ -> None 565 + 566 + (** Parse a boolean string *) 567 + let parse_bool = function 568 + | "true" | "" -> Some true 569 + | "false" -> Some false 570 + | _ -> None 571 + 572 + (** Parse a single attribute name-value pair to typed attribute *) 573 + let parse_attr name value : t = 574 + let name_lower = String.lowercase_ascii name in 575 + let value_lower = String.lowercase_ascii value in 576 + match name_lower with 577 + (* Global attributes *) 578 + | "id" -> `Id value 579 + | "class" -> `Class value 580 + | "style" -> `Style value 581 + | "title" -> `Title value 582 + | "lang" -> `Lang value 583 + | "dir" -> (match parse_dir value_lower with Some d -> `Dir d | None -> `Unknown_attr (name, value)) 584 + | "hidden" -> `Hidden (parse_hidden value_lower) 585 + | "tabindex" -> (match parse_int value with Some i -> `Tabindex i | None -> `Unknown_attr (name, value)) 586 + | "accesskey" -> `Accesskey value 587 + | "autocapitalize" -> (match parse_autocapitalize value_lower with Some a -> `Autocapitalize a | None -> `Unknown_attr (name, value)) 588 + | "autofocus" -> `Autofocus 589 + | "contenteditable" -> `Contenteditable (parse_contenteditable value_lower) 590 + | "draggable" -> (match parse_bool value_lower with Some b -> `Draggable b | None -> `Unknown_attr (name, value)) 591 + | "enterkeyhint" -> (match parse_enterkeyhint value_lower with Some e -> `Enterkeyhint e | None -> `Unknown_attr (name, value)) 592 + | "inert" -> `Inert 593 + | "inputmode" -> (match parse_inputmode value_lower with Some i -> `Inputmode i | None -> `Unknown_attr (name, value)) 594 + | "is" -> `Is value 595 + | "nonce" -> `Nonce value 596 + | "popover" -> `Popover (parse_popover value_lower) 597 + | "slot" -> `Slot value 598 + | "spellcheck" -> `Spellcheck (parse_bool value_lower) 599 + | "translate" -> (match value_lower with "yes" | "" -> `Translate true | "no" -> `Translate false | _ -> `Unknown_attr (name, value)) 600 + | "exportparts" -> `Exportparts value 601 + | "part" -> `Part value 602 + 603 + (* Microdata *) 604 + | "itemscope" -> `Itemscope 605 + | "itemtype" -> `Itemtype value 606 + | "itemprop" -> `Itemprop value 607 + | "itemid" -> `Itemid value 608 + | "itemref" -> `Itemref value 609 + 610 + (* ARIA - role and aria-* *) 611 + | "role" -> `Role value 612 + | _ when String.starts_with ~prefix:"aria-" name_lower -> 613 + let aria_name = String.sub name_lower 5 (String.length name_lower - 5) in 614 + `Aria (aria_name, value) 615 + 616 + (* Event handlers - on* *) 617 + | _ when String.starts_with ~prefix:"on" name_lower && String.length name_lower > 2 -> 618 + let event_name = String.sub name_lower 2 (String.length name_lower - 2) in 619 + `Event (event_name, value) 620 + 621 + (* Link/navigation attributes *) 622 + | "href" -> `Href value 623 + | "target" -> `Target (parse_target value) 624 + | "rel" -> `Rel value 625 + | "download" -> `Download (if value = "" then None else Some value) 626 + | "hreflang" -> `Hreflang value 627 + | "ping" -> `Ping value 628 + | "referrerpolicy" -> (match parse_referrerpolicy value_lower with Some r -> `Referrerpolicy r | None -> `Unknown_attr (name, value)) 629 + 630 + (* Media/resource attributes *) 631 + | "src" -> `Src value 632 + | "srcset" -> `Srcset value 633 + | "sizes" -> `Sizes value 634 + | "alt" -> `Alt value 635 + | "width" -> `Width value 636 + | "height" -> `Height value 637 + | "loading" -> (match parse_loading value_lower with Some l -> `Loading l | None -> `Unknown_attr (name, value)) 638 + | "decoding" -> (match parse_decoding value_lower with Some d -> `Decoding d | None -> `Unknown_attr (name, value)) 639 + | "fetchpriority" -> (match parse_fetchpriority value_lower with Some f -> `Fetchpriority f | None -> `Unknown_attr (name, value)) 640 + | "crossorigin" -> `Crossorigin (parse_crossorigin value_lower) 641 + | "ismap" -> `Ismap 642 + | "usemap" -> `Usemap value 643 + | "media" -> `Media value 644 + 645 + (* Audio/Video specific *) 646 + | "controls" -> `Controls 647 + | "autoplay" -> `Autoplay 648 + | "loop" -> `Loop 649 + | "muted" -> `Muted 650 + | "preload" -> (match parse_preload value_lower with Some p -> `Preload p | None -> `Unknown_attr (name, value)) 651 + | "poster" -> `Poster value 652 + | "playsinline" -> `Playsinline 653 + 654 + (* Image map *) 655 + | "coords" -> `Coords value 656 + | "shape" -> (match parse_shape value_lower with Some s -> `Shape s | None -> `Unknown_attr (name, value)) 657 + 658 + (* iframe *) 659 + | "sandbox" -> `Sandbox (parse_sandbox value_lower) 660 + | "allow" -> `Allow value 661 + | "allowfullscreen" -> `Allowfullscreen 662 + | "srcdoc" -> `Srcdoc value 663 + | "csp" -> `Csp value 664 + 665 + (* Form attributes *) 666 + | "action" -> `Action value 667 + | "method" -> (match parse_method value_lower with Some m -> `Method m | None -> `Unknown_attr (name, value)) 668 + | "enctype" -> (match parse_enctype value_lower with Some e -> `Enctype e | None -> `Unknown_attr (name, value)) 669 + | "novalidate" -> `Novalidate 670 + | "accept-charset" -> `Accept_charset value 671 + | "autocomplete" -> `Autocomplete value 672 + | "name" -> `Name value 673 + | "form" -> `Form value 674 + 675 + (* Form control attributes *) 676 + | "value" -> `Value value 677 + | "type" -> `Unknown_attr (name, value) (* type is context-dependent, handle in element parsing *) 678 + | "disabled" -> `Disabled 679 + | "readonly" -> `Readonly 680 + | "required" -> `Required 681 + | "checked" -> `Checked 682 + | "selected" -> `Selected 683 + | "multiple" -> `Multiple 684 + | "placeholder" -> `Placeholder value 685 + | "min" -> `Min value 686 + | "max" -> `Max value 687 + | "step" -> `Step value 688 + | "minlength" -> (match parse_int value with Some i -> `Minlength i | None -> `Unknown_attr (name, value)) 689 + | "maxlength" -> (match parse_int value with Some i -> `Maxlength i | None -> `Unknown_attr (name, value)) 690 + | "pattern" -> `Pattern value 691 + | "size" -> (match parse_int value with Some i -> `Size i | None -> `Unknown_attr (name, value)) 692 + | "cols" -> (match parse_int value with Some i -> `Cols i | None -> `Unknown_attr (name, value)) 693 + | "rows" -> (match parse_int value with Some i -> `Rows i | None -> `Unknown_attr (name, value)) 694 + | "wrap" -> (match parse_wrap value_lower with Some w -> `Wrap w | None -> `Unknown_attr (name, value)) 695 + | "accept" -> `Accept value 696 + | "capture" -> (match parse_capture value_lower with Some c -> `Capture c | None -> `Unknown_attr (name, value)) 697 + | "dirname" -> `Dirname value 698 + | "for" -> `For value 699 + | "list" -> `List value 700 + 701 + (* Form submission attributes *) 702 + | "formaction" -> `Formaction value 703 + | "formmethod" -> (match parse_method value_lower with Some m -> `Formmethod m | None -> `Unknown_attr (name, value)) 704 + | "formenctype" -> (match parse_enctype value_lower with Some e -> `Formenctype e | None -> `Unknown_attr (name, value)) 705 + | "formnovalidate" -> `Formnovalidate 706 + | "formtarget" -> `Formtarget (parse_target value) 707 + 708 + (* Table attributes *) 709 + | "colspan" -> (match parse_int value with Some i -> `Colspan i | None -> `Unknown_attr (name, value)) 710 + | "rowspan" -> (match parse_int value with Some i -> `Rowspan i | None -> `Unknown_attr (name, value)) 711 + | "headers" -> `Headers value 712 + | "scope" -> (match parse_scope value_lower with Some s -> `Scope s | None -> `Unknown_attr (name, value)) 713 + | "span" -> (match parse_int value with Some i -> `Span i | None -> `Unknown_attr (name, value)) 714 + 715 + (* Details/Dialog *) 716 + | "open" -> `Open 717 + 718 + (* Script *) 719 + | "async" -> `Async 720 + | "defer" -> `Defer 721 + | "integrity" -> `Integrity value 722 + | "nomodule" -> `Nomodule 723 + | "blocking" -> `Blocking value 724 + 725 + (* Meta *) 726 + | "charset" -> `Charset value 727 + | "content" -> `Content value 728 + | "http-equiv" -> `Http_equiv value 729 + 730 + (* Link element *) 731 + | "as" -> `As value 732 + | "imagesizes" -> `Imagesizes value 733 + | "imagesrcset" -> `Imagesrcset value 734 + 735 + (* Object *) 736 + | "data" -> `Data_object value 737 + 738 + (* Meter/Progress *) 739 + | "low" -> (match parse_float value with Some f -> `Low f | None -> `Unknown_attr (name, value)) 740 + | "high" -> (match parse_float value with Some f -> `High f | None -> `Unknown_attr (name, value)) 741 + | "optimum" -> (match parse_float value with Some f -> `Optimum f | None -> `Unknown_attr (name, value)) 742 + 743 + (* Time *) 744 + | "datetime" -> `Datetime value 745 + 746 + (* Ol *) 747 + | "start" -> (match parse_int value with Some i -> `Start i | None -> `Unknown_attr (name, value)) 748 + | "reversed" -> `Reversed 749 + 750 + (* Track *) 751 + | "kind" -> (match parse_kind value_lower with Some k -> `Kind k | None -> `Unknown_attr (name, value)) 752 + | "srclang" -> `Srclang value 753 + | "default" -> `Default 754 + 755 + (* Td/Th *) 756 + | "abbr" -> `Abbr value 757 + 758 + (* RDFa *) 759 + | "property" -> `Property value 760 + | "typeof" -> `Typeof value 761 + | "resource" -> `Resource value 762 + | "prefix" -> `Prefix value 763 + | "vocab" -> `Vocab value 764 + | "about" -> `About value 765 + | "datatype" -> `Datatype value 766 + | "inlist" -> `Inlist 767 + | "rev" -> `Rev value 768 + 769 + (* Data attributes *) 770 + | _ when String.starts_with ~prefix:"data-" name_lower -> 771 + let data_name = String.sub name_lower 5 (String.length name_lower - 5) in 772 + `Data_attr (data_name, value) 773 + 774 + (* Escape hatch *) 775 + | _ -> `Unknown_attr (name, value) 776 + 777 + (** Parse multiple attributes *) 778 + let parse_attrs (attrs : (string * string) list) : t list = 779 + List.map (fun (n, v) -> parse_attr n v) attrs 780 + 781 + (** {1 Accessor Functions} *) 782 + 783 + (** Get id attribute *) 784 + let get_id attrs = 785 + List.find_map (function `Id s -> Some s | _ -> None) attrs 786 + 787 + (** Get class attribute *) 788 + let get_class attrs = 789 + List.find_map (function `Class s -> Some s | _ -> None) attrs 790 + 791 + (** Get href attribute *) 792 + let get_href attrs = 793 + List.find_map (function `Href s -> Some s | _ -> None) attrs 794 + 795 + (** Get src attribute *) 796 + let get_src attrs = 797 + List.find_map (function `Src s -> Some s | _ -> None) attrs 798 + 799 + (** Get alt attribute *) 800 + let get_alt attrs = 801 + List.find_map (function `Alt s -> Some s | _ -> None) attrs 802 + 803 + (** Get name attribute *) 804 + let get_name attrs = 805 + List.find_map (function `Name s -> Some s | _ -> None) attrs 806 + 807 + (** Get value attribute *) 808 + let get_value attrs = 809 + List.find_map (function `Value s -> Some s | _ -> None) attrs 810 + 811 + (** Get role attribute *) 812 + let get_role attrs = 813 + List.find_map (function `Role s -> Some s | _ -> None) attrs 814 + 815 + (** Get a specific aria-* attribute *) 816 + let get_aria name attrs = 817 + List.find_map (function `Aria (n, v) when n = name -> Some v | _ -> None) attrs 818 + 819 + (** Get a specific data-* attribute *) 820 + let get_data name attrs = 821 + List.find_map (function `Data_attr (n, v) when n = name -> Some v | _ -> None) attrs 822 + 823 + (** Check if disabled is present *) 824 + let has_disabled attrs = 825 + List.exists (function `Disabled -> true | _ -> false) attrs 826 + 827 + (** Check if required is present *) 828 + let has_required attrs = 829 + List.exists (function `Required -> true | _ -> false) attrs 830 + 831 + (** Check if readonly is present *) 832 + let has_readonly attrs = 833 + List.exists (function `Readonly -> true | _ -> false) attrs 834 + 835 + (** Check if checked is present *) 836 + let has_checked attrs = 837 + List.exists (function `Checked -> true | _ -> false) attrs 838 + 839 + (** Check if autofocus is present *) 840 + let has_autofocus attrs = 841 + List.exists (function `Autofocus -> true | _ -> false) attrs 842 + 843 + (** Check if hidden is present *) 844 + let has_hidden attrs = 845 + List.exists (function `Hidden _ -> true | _ -> false) attrs 846 + 847 + (** Check if inert is present *) 848 + let has_inert attrs = 849 + List.exists (function `Inert -> true | _ -> false) attrs 850 + 851 + (** Check if open is present *) 852 + let has_open attrs = 853 + List.exists (function `Open -> true | _ -> false) attrs 854 + 855 + (** Get all aria-* attributes *) 856 + let get_all_aria attrs = 857 + List.filter_map (function `Aria (n, v) -> Some (n, v) | _ -> None) attrs 858 + 859 + (** Get all data-* attributes *) 860 + let get_all_data attrs = 861 + List.filter_map (function `Data_attr (n, v) -> Some (n, v) | _ -> None) attrs 862 + 863 + (** Find an attribute matching a predicate *) 864 + let find f attrs = 865 + List.find_map f attrs 866 + 867 + (** Check if any attribute matches *) 868 + let exists f attrs = 869 + List.exists f attrs 870 + 871 + (** Filter attributes *) 872 + let filter f attrs = 873 + List.filter f attrs
+289
lib/htmlrw_check/element/element.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Typed HTML5 element representation. 7 + 8 + This module combines tags and attributes into a complete typed element 9 + representation with conversion functions. *) 10 + 11 + (** {1 Element Type} *) 12 + 13 + (** A typed HTML element *) 14 + type t = { 15 + tag : Tag.element_tag; 16 + attrs : Attr.t list; 17 + raw_attrs : (string * string) list; (** Original for fallback *) 18 + } 19 + 20 + (** {1 Parsing Functions} *) 21 + 22 + (** Parse element-specific type attribute based on tag *) 23 + let parse_type_attr (tag : Tag.html_tag) value : Attr.t = 24 + let value_lower = String.lowercase_ascii value in 25 + match tag with 26 + | `Input -> 27 + (match Attr.parse_input_type value_lower with 28 + | Some t -> `Type_input t 29 + | None -> `Unknown_attr ("type", value)) 30 + | `Button -> 31 + (match Attr.parse_button_type value_lower with 32 + | Some t -> `Type_button t 33 + | None -> `Unknown_attr ("type", value)) 34 + | `Script -> `Type_script value 35 + | `Link -> `Type_link value 36 + | `Ol -> 37 + (match Attr.parse_list_type value_lower with 38 + | Some t -> `Type_list t 39 + | None -> `Unknown_attr ("type", value)) 40 + | _ -> `Unknown_attr ("type", value) 41 + 42 + (** Parse attributes with element context for type attribute *) 43 + let parse_attrs_for_tag (tag : Tag.element_tag) (raw_attrs : (string * string) list) : Attr.t list = 44 + List.map (fun (name, value) -> 45 + let name_lower = String.lowercase_ascii name in 46 + if name_lower = "type" then 47 + match tag with 48 + | Tag.Html html_tag -> parse_type_attr html_tag value 49 + | _ -> `Unknown_attr (name, value) 50 + else 51 + Attr.parse_attr name value 52 + ) raw_attrs 53 + 54 + (** Create an element from raw input *) 55 + let create ~name ~namespace ~attrs:raw_attrs = 56 + let tag = Tag.tag_of_string ?namespace name in 57 + let attrs = parse_attrs_for_tag tag raw_attrs in 58 + { tag; attrs; raw_attrs } 59 + 60 + (** {1 Accessor Functions} *) 61 + 62 + (** Get the tag *) 63 + let tag elem = elem.tag 64 + 65 + (** Get typed attributes *) 66 + let attrs elem = elem.attrs 67 + 68 + (** Get raw attributes *) 69 + let raw_attrs elem = elem.raw_attrs 70 + 71 + (** Get the tag name as string *) 72 + let tag_name elem = Tag.tag_to_string elem.tag 73 + 74 + (** Check if element is a specific HTML tag *) 75 + let is_html_tag expected elem = 76 + Tag.is_html_tag expected elem.tag 77 + 78 + (** Get the HTML tag if this is an HTML element *) 79 + let as_html_tag elem = 80 + Tag.as_html_tag elem.tag 81 + 82 + (** {1 Attribute Accessors (delegated to Attr module)} *) 83 + 84 + let get_id elem = Attr.get_id elem.attrs 85 + let get_class elem = Attr.get_class elem.attrs 86 + let get_href elem = Attr.get_href elem.attrs 87 + let get_src elem = Attr.get_src elem.attrs 88 + let get_alt elem = Attr.get_alt elem.attrs 89 + let get_name elem = Attr.get_name elem.attrs 90 + let get_value elem = Attr.get_value elem.attrs 91 + let get_role elem = Attr.get_role elem.attrs 92 + let get_aria name elem = Attr.get_aria name elem.attrs 93 + let get_data name elem = Attr.get_data name elem.attrs 94 + 95 + let has_disabled elem = Attr.has_disabled elem.attrs 96 + let has_required elem = Attr.has_required elem.attrs 97 + let has_readonly elem = Attr.has_readonly elem.attrs 98 + let has_checked elem = Attr.has_checked elem.attrs 99 + let has_autofocus elem = Attr.has_autofocus elem.attrs 100 + let has_hidden elem = Attr.has_hidden elem.attrs 101 + let has_inert elem = Attr.has_inert elem.attrs 102 + let has_open elem = Attr.has_open elem.attrs 103 + 104 + let get_all_aria elem = Attr.get_all_aria elem.attrs 105 + let get_all_data elem = Attr.get_all_data elem.attrs 106 + 107 + (** {1 Category Checks} *) 108 + 109 + (** Check if this is a void element *) 110 + let is_void elem = 111 + match elem.tag with 112 + | Tag.Html t -> Tag.is_void t 113 + | _ -> false 114 + 115 + (** Check if this is a heading element *) 116 + let is_heading elem = 117 + match elem.tag with 118 + | Tag.Html t -> Tag.is_heading t 119 + | _ -> false 120 + 121 + (** Get heading level (1-6) or None *) 122 + let heading_level elem = 123 + match elem.tag with 124 + | Tag.Html t -> Tag.heading_level t 125 + | _ -> None 126 + 127 + (** Check if this is sectioning content *) 128 + let is_sectioning elem = 129 + match elem.tag with 130 + | Tag.Html t -> Tag.is_sectioning t 131 + | _ -> false 132 + 133 + (** Check if this is a sectioning root *) 134 + let is_sectioning_root elem = 135 + match elem.tag with 136 + | Tag.Html t -> Tag.is_sectioning_root t 137 + | _ -> false 138 + 139 + (** Check if this is embedded content *) 140 + let is_embedded elem = 141 + match elem.tag with 142 + | Tag.Html t -> Tag.is_embedded t 143 + | _ -> false 144 + 145 + (** Check if this is interactive content *) 146 + let is_interactive elem = 147 + match elem.tag with 148 + | Tag.Html t -> Tag.is_interactive t 149 + | _ -> false 150 + 151 + (** Check if this is form-associated *) 152 + let is_form_associated elem = 153 + match elem.tag with 154 + | Tag.Html t -> Tag.is_form_associated t 155 + | _ -> false 156 + 157 + (** Check if this is labelable *) 158 + let is_labelable elem = 159 + match elem.tag with 160 + | Tag.Html t -> Tag.is_labelable t 161 + | _ -> false 162 + 163 + (** Check if this is submittable *) 164 + let is_submittable elem = 165 + match elem.tag with 166 + | Tag.Html t -> Tag.is_submittable t 167 + | _ -> false 168 + 169 + (** Check if this is a table element *) 170 + let is_table_element elem = 171 + match elem.tag with 172 + | Tag.Html t -> Tag.is_table_element t 173 + | _ -> false 174 + 175 + (** Check if this is a media element *) 176 + let is_media elem = 177 + match elem.tag with 178 + | Tag.Html t -> Tag.is_media t 179 + | _ -> false 180 + 181 + (** Check if this is a list container *) 182 + let is_list_container elem = 183 + match elem.tag with 184 + | Tag.Html t -> Tag.is_list_container t 185 + | _ -> false 186 + 187 + (** Check if this has transparent content model *) 188 + let is_transparent elem = 189 + match elem.tag with 190 + | Tag.Html t -> Tag.is_transparent t 191 + | _ -> false 192 + 193 + (** Check if this is phrasing content *) 194 + let is_phrasing elem = 195 + match elem.tag with 196 + | Tag.Html t -> Tag.is_phrasing t 197 + | _ -> false 198 + 199 + (** Check if this is flow content *) 200 + let is_flow elem = 201 + match elem.tag with 202 + | Tag.Html t -> Tag.is_flow t 203 + | _ -> true (* Custom elements are flow content *) 204 + 205 + (** Check if this is a deprecated element *) 206 + let is_obsolete elem = 207 + match elem.tag with 208 + | Tag.Html t -> Tag.is_obsolete t 209 + | _ -> false 210 + 211 + (** Check if this is an SVG element *) 212 + let is_svg elem = 213 + match elem.tag with 214 + | Tag.Svg _ -> true 215 + | _ -> false 216 + 217 + (** Check if this is a MathML element *) 218 + let is_mathml elem = 219 + match elem.tag with 220 + | Tag.MathML _ -> true 221 + | _ -> false 222 + 223 + (** Check if this is a custom element *) 224 + let is_custom elem = 225 + match elem.tag with 226 + | Tag.Custom _ -> true 227 + | _ -> false 228 + 229 + (** Check if this is an unknown element *) 230 + let is_unknown elem = 231 + match elem.tag with 232 + | Tag.Unknown _ -> true 233 + | _ -> false 234 + 235 + (** {1 Input Type Utilities} *) 236 + 237 + (** Get input type for input elements *) 238 + let get_input_type elem = 239 + match elem.tag with 240 + | Tag.Html `Input -> 241 + List.find_map (function 242 + | `Type_input t -> Some t 243 + | _ -> None 244 + ) elem.attrs 245 + | _ -> None 246 + 247 + (** Get button type for button elements *) 248 + let get_button_type elem = 249 + match elem.tag with 250 + | Tag.Html `Button -> 251 + List.find_map (function 252 + | `Type_button t -> Some t 253 + | _ -> None 254 + ) elem.attrs 255 + | _ -> None 256 + 257 + (** Check if input is of a specific type *) 258 + let is_input_type expected elem = 259 + match get_input_type elem with 260 + | Some t -> t = expected 261 + | None -> false 262 + 263 + (** {1 Raw Attribute Fallback} *) 264 + 265 + (** Get raw attribute value (from original attrs) *) 266 + let get_raw_attr name elem = 267 + List.find_map (fun (n, v) -> 268 + if String.lowercase_ascii n = String.lowercase_ascii name then Some v else None 269 + ) elem.raw_attrs 270 + 271 + (** Check if raw attribute exists *) 272 + let has_raw_attr name elem = 273 + List.exists (fun (n, _) -> 274 + String.lowercase_ascii n = String.lowercase_ascii name 275 + ) elem.raw_attrs 276 + 277 + (** {1 Pattern Matching Helpers} *) 278 + 279 + (** Match on HTML tag or return None *) 280 + let match_html elem f = 281 + match elem.tag with 282 + | Tag.Html tag -> Some (f tag) 283 + | _ -> None 284 + 285 + (** Match on specific HTML tag *) 286 + let when_html_tag expected elem f = 287 + match elem.tag with 288 + | Tag.Html tag when tag = expected -> Some (f ()) 289 + | _ -> None
+523
lib/htmlrw_check/element/tag.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Typed HTML5 tag representations using polymorphic variants. 7 + 8 + This module provides compile-time type safety for HTML elements while 9 + maintaining escape hatches for unknown/custom elements. *) 10 + 11 + (** {1 HTML Tag Types} *) 12 + 13 + (** All standard HTML5 elements plus deprecated elements needed by the validator *) 14 + type html_tag = [ 15 + (* Document metadata *) 16 + | `Html | `Head | `Title | `Base | `Link | `Meta | `Style 17 + 18 + (* Sectioning root *) 19 + | `Body 20 + 21 + (* Content sectioning *) 22 + | `Address | `Article | `Aside | `Footer | `Header | `Hgroup 23 + | `Main | `Nav | `Search | `Section 24 + 25 + (* Heading content *) 26 + | `H1 | `H2 | `H3 | `H4 | `H5 | `H6 27 + 28 + (* Grouping content *) 29 + | `Blockquote | `Dd | `Div | `Dl | `Dt | `Figcaption | `Figure 30 + | `Hr | `Li | `Menu | `Ol | `P | `Pre | `Ul 31 + 32 + (* Text-level semantics *) 33 + | `A | `Abbr | `B | `Bdi | `Bdo | `Br | `Cite | `Code | `Data 34 + | `Dfn | `Em | `I | `Kbd | `Mark | `Q | `Rp | `Rt | `Ruby 35 + | `S | `Samp | `Small | `Span | `Strong | `Sub | `Sup | `Time 36 + | `U | `Var | `Wbr 37 + 38 + (* Edits *) 39 + | `Del | `Ins 40 + 41 + (* Embedded content *) 42 + | `Area | `Audio | `Canvas | `Embed | `Iframe | `Img | `Map | `Object 43 + | `Picture | `Source | `Track | `Video 44 + 45 + (* Tabular data *) 46 + | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot 47 + | `Th | `Thead | `Tr 48 + 49 + (* Forms *) 50 + | `Button | `Datalist | `Fieldset | `Form | `Input | `Label 51 + | `Legend | `Meter | `Optgroup | `Option | `Output | `Progress 52 + | `Select | `Textarea 53 + 54 + (* Interactive elements *) 55 + | `Details | `Dialog | `Summary 56 + 57 + (* Scripting *) 58 + | `Noscript | `Script | `Slot | `Template 59 + 60 + (* Web Components / Misc *) 61 + | `Portal | `Param 62 + 63 + (* Deprecated/obsolete elements (needed by validator) *) 64 + | `Applet | `Acronym | `Bgsound | `Dir | `Frame | `Frameset 65 + | `Noframes | `Isindex | `Keygen | `Listing | `Menuitem | `Nextid 66 + | `Noembed | `Plaintext | `Rb | `Rtc | `Strike | `Xmp 67 + | `Basefont | `Big | `Blink | `Center | `Font | `Marquee 68 + | `Multicol | `Nobr | `Spacer | `Tt | `Image 69 + ] 70 + 71 + (** {1 Category Types} 72 + 73 + Categories as type aliases for subsets, enabling functions that only accept 74 + specific categories with compile-time checking. *) 75 + 76 + (** Void elements - cannot have children *) 77 + type void_tag = [ 78 + | `Area | `Base | `Br | `Col | `Embed | `Hr | `Img | `Input 79 + | `Link | `Meta | `Source | `Track | `Wbr 80 + (* Deprecated void elements *) 81 + | `Basefont | `Frame | `Isindex | `Keygen | `Param 82 + ] 83 + 84 + (** Heading elements *) 85 + type heading_tag = [ `H1 | `H2 | `H3 | `H4 | `H5 | `H6 ] 86 + 87 + (** Sectioning content *) 88 + type sectioning_tag = [ `Article | `Aside | `Nav | `Section ] 89 + 90 + (** Sectioning roots (establish their own outline) *) 91 + type sectioning_root_tag = [ 92 + | `Blockquote | `Body | `Details | `Dialog | `Fieldset | `Figure | `Td 93 + ] 94 + 95 + (** Embedded content *) 96 + type embedded_tag = [ 97 + | `Audio | `Canvas | `Embed | `Iframe | `Img | `Object | `Picture | `Video 98 + ] 99 + 100 + (** Interactive content (focusable/activatable) *) 101 + type interactive_tag = [ 102 + | `A | `Audio | `Button | `Details | `Embed | `Iframe | `Img 103 + | `Input | `Label | `Select | `Textarea | `Video 104 + ] 105 + 106 + (** Form-associated elements *) 107 + type form_associated_tag = [ 108 + | `Button | `Fieldset | `Input | `Label | `Object | `Output 109 + | `Select | `Textarea | `Meter | `Progress 110 + ] 111 + 112 + (** Labelable elements *) 113 + type labelable_tag = [ 114 + | `Button | `Input | `Meter | `Output | `Progress | `Select | `Textarea 115 + ] 116 + 117 + (** Submittable elements *) 118 + type submittable_tag = [ 119 + | `Button | `Input | `Select | `Textarea 120 + ] 121 + 122 + (** Resettable elements *) 123 + type resettable_tag = [ 124 + | `Input | `Output | `Select | `Textarea 125 + ] 126 + 127 + (** Table elements *) 128 + type table_tag = [ 129 + | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot 130 + | `Th | `Thead | `Tr 131 + ] 132 + 133 + (** Media elements *) 134 + type media_tag = [ `Audio | `Video ] 135 + 136 + (** List container elements *) 137 + type list_container_tag = [ `Ul | `Ol | `Menu | `Dl ] 138 + 139 + (** List item elements *) 140 + type list_item_tag = [ `Li | `Dd | `Dt ] 141 + 142 + (** Script-supporting elements *) 143 + type script_supporting_tag = [ `Script | `Template ] 144 + 145 + (** Metadata content *) 146 + type metadata_tag = [ `Base | `Link | `Meta | `Noscript | `Script | `Style | `Template | `Title ] 147 + 148 + (** {1 Top-Level Element Type} *) 149 + 150 + (** Top-level element classification *) 151 + type element_tag = 152 + | Html of html_tag (** Known HTML5 element *) 153 + | Svg of string (** SVG element by local name *) 154 + | MathML of string (** MathML element by local name *) 155 + | Custom of string (** Custom element like <my-widget> *) 156 + | Unknown of string (** Truly unknown element *) 157 + 158 + (** {1 Conversion Functions} *) 159 + 160 + (** Convert a lowercase tag name string to html_tag option *) 161 + let html_tag_of_string_opt name = 162 + match name with 163 + (* Document metadata *) 164 + | "html" -> Some `Html | "head" -> Some `Head | "title" -> Some `Title 165 + | "base" -> Some `Base | "link" -> Some `Link | "meta" -> Some `Meta 166 + | "style" -> Some `Style 167 + (* Sectioning root *) 168 + | "body" -> Some `Body 169 + (* Content sectioning *) 170 + | "address" -> Some `Address | "article" -> Some `Article | "aside" -> Some `Aside 171 + | "footer" -> Some `Footer | "header" -> Some `Header | "hgroup" -> Some `Hgroup 172 + | "main" -> Some `Main | "nav" -> Some `Nav | "search" -> Some `Search 173 + | "section" -> Some `Section 174 + (* Headings *) 175 + | "h1" -> Some `H1 | "h2" -> Some `H2 | "h3" -> Some `H3 176 + | "h4" -> Some `H4 | "h5" -> Some `H5 | "h6" -> Some `H6 177 + (* Grouping content *) 178 + | "blockquote" -> Some `Blockquote | "dd" -> Some `Dd | "div" -> Some `Div 179 + | "dl" -> Some `Dl | "dt" -> Some `Dt | "figcaption" -> Some `Figcaption 180 + | "figure" -> Some `Figure | "hr" -> Some `Hr | "li" -> Some `Li 181 + | "menu" -> Some `Menu | "ol" -> Some `Ol | "p" -> Some `P 182 + | "pre" -> Some `Pre | "ul" -> Some `Ul 183 + (* Text-level semantics *) 184 + | "a" -> Some `A | "abbr" -> Some `Abbr | "b" -> Some `B 185 + | "bdi" -> Some `Bdi | "bdo" -> Some `Bdo | "br" -> Some `Br 186 + | "cite" -> Some `Cite | "code" -> Some `Code | "data" -> Some `Data 187 + | "dfn" -> Some `Dfn | "em" -> Some `Em | "i" -> Some `I 188 + | "kbd" -> Some `Kbd | "mark" -> Some `Mark | "q" -> Some `Q 189 + | "rp" -> Some `Rp | "rt" -> Some `Rt | "ruby" -> Some `Ruby 190 + | "s" -> Some `S | "samp" -> Some `Samp | "small" -> Some `Small 191 + | "span" -> Some `Span | "strong" -> Some `Strong | "sub" -> Some `Sub 192 + | "sup" -> Some `Sup | "time" -> Some `Time | "u" -> Some `U 193 + | "var" -> Some `Var | "wbr" -> Some `Wbr 194 + (* Edits *) 195 + | "del" -> Some `Del | "ins" -> Some `Ins 196 + (* Embedded content *) 197 + | "area" -> Some `Area | "audio" -> Some `Audio | "canvas" -> Some `Canvas 198 + | "embed" -> Some `Embed | "iframe" -> Some `Iframe | "img" -> Some `Img 199 + | "map" -> Some `Map | "object" -> Some `Object | "picture" -> Some `Picture 200 + | "source" -> Some `Source | "track" -> Some `Track | "video" -> Some `Video 201 + (* Tabular data *) 202 + | "caption" -> Some `Caption | "col" -> Some `Col | "colgroup" -> Some `Colgroup 203 + | "table" -> Some `Table | "tbody" -> Some `Tbody | "td" -> Some `Td 204 + | "tfoot" -> Some `Tfoot | "th" -> Some `Th | "thead" -> Some `Thead 205 + | "tr" -> Some `Tr 206 + (* Forms *) 207 + | "button" -> Some `Button | "datalist" -> Some `Datalist 208 + | "fieldset" -> Some `Fieldset | "form" -> Some `Form | "input" -> Some `Input 209 + | "label" -> Some `Label | "legend" -> Some `Legend | "meter" -> Some `Meter 210 + | "optgroup" -> Some `Optgroup | "option" -> Some `Option 211 + | "output" -> Some `Output | "progress" -> Some `Progress 212 + | "select" -> Some `Select | "textarea" -> Some `Textarea 213 + (* Interactive *) 214 + | "details" -> Some `Details | "dialog" -> Some `Dialog | "summary" -> Some `Summary 215 + (* Scripting *) 216 + | "noscript" -> Some `Noscript | "script" -> Some `Script 217 + | "slot" -> Some `Slot | "template" -> Some `Template 218 + (* Web Components / Misc *) 219 + | "portal" -> Some `Portal | "param" -> Some `Param 220 + (* Deprecated/obsolete elements *) 221 + | "applet" -> Some `Applet | "acronym" -> Some `Acronym | "bgsound" -> Some `Bgsound 222 + | "dir" -> Some `Dir | "frame" -> Some `Frame | "frameset" -> Some `Frameset 223 + | "noframes" -> Some `Noframes | "isindex" -> Some `Isindex | "keygen" -> Some `Keygen 224 + | "listing" -> Some `Listing | "menuitem" -> Some `Menuitem | "nextid" -> Some `Nextid 225 + | "noembed" -> Some `Noembed | "plaintext" -> Some `Plaintext 226 + | "rb" -> Some `Rb | "rtc" -> Some `Rtc | "strike" -> Some `Strike | "xmp" -> Some `Xmp 227 + | "basefont" -> Some `Basefont | "big" -> Some `Big | "blink" -> Some `Blink 228 + | "center" -> Some `Center | "font" -> Some `Font | "marquee" -> Some `Marquee 229 + | "multicol" -> Some `Multicol | "nobr" -> Some `Nobr | "spacer" -> Some `Spacer 230 + | "tt" -> Some `Tt | "image" -> Some `Image 231 + | _ -> None 232 + 233 + (** Check if a name is a valid custom element name (contains hyphen, not reserved) *) 234 + let is_custom_element_name name = 235 + String.contains name '-' && 236 + not (String.starts_with ~prefix:"xml" (String.lowercase_ascii name)) && 237 + not (String.equal (String.lowercase_ascii name) "annotation-xml") 238 + 239 + (** SVG namespace URI *) 240 + let svg_namespace = "http://www.w3.org/2000/svg" 241 + 242 + (** MathML namespace URI *) 243 + let mathml_namespace = "http://www.w3.org/1998/Math/MathML" 244 + 245 + (** Check if namespace is SVG (accepts both short and full URI) *) 246 + let is_svg_namespace = function 247 + | "svg" | "http://www.w3.org/2000/svg" -> true 248 + | _ -> false 249 + 250 + (** Check if namespace is MathML (accepts both short and full URI) *) 251 + let is_mathml_namespace = function 252 + | "mathml" | "http://www.w3.org/1998/Math/MathML" -> true 253 + | _ -> false 254 + 255 + (** Convert tag name and optional namespace to element_tag *) 256 + let tag_of_string ?namespace name = 257 + let name_lower = String.lowercase_ascii name in 258 + match namespace with 259 + | Some ns when is_svg_namespace ns -> Svg name_lower 260 + | Some ns when is_mathml_namespace ns -> MathML name_lower 261 + | Some _ -> Unknown name_lower (* Unknown namespace *) 262 + | None -> 263 + match html_tag_of_string_opt name_lower with 264 + | Some tag -> Html tag 265 + | None -> 266 + if is_custom_element_name name_lower then 267 + Custom name_lower 268 + else 269 + Unknown name_lower 270 + 271 + (** Convert html_tag to string *) 272 + let html_tag_to_string (tag : html_tag) : string = 273 + match tag with 274 + (* Document metadata *) 275 + | `Html -> "html" | `Head -> "head" | `Title -> "title" 276 + | `Base -> "base" | `Link -> "link" | `Meta -> "meta" | `Style -> "style" 277 + (* Sectioning root *) 278 + | `Body -> "body" 279 + (* Content sectioning *) 280 + | `Address -> "address" | `Article -> "article" | `Aside -> "aside" 281 + | `Footer -> "footer" | `Header -> "header" | `Hgroup -> "hgroup" 282 + | `Main -> "main" | `Nav -> "nav" | `Search -> "search" | `Section -> "section" 283 + (* Headings *) 284 + | `H1 -> "h1" | `H2 -> "h2" | `H3 -> "h3" 285 + | `H4 -> "h4" | `H5 -> "h5" | `H6 -> "h6" 286 + (* Grouping content *) 287 + | `Blockquote -> "blockquote" | `Dd -> "dd" | `Div -> "div" 288 + | `Dl -> "dl" | `Dt -> "dt" | `Figcaption -> "figcaption" 289 + | `Figure -> "figure" | `Hr -> "hr" | `Li -> "li" 290 + | `Menu -> "menu" | `Ol -> "ol" | `P -> "p" | `Pre -> "pre" | `Ul -> "ul" 291 + (* Text-level semantics *) 292 + | `A -> "a" | `Abbr -> "abbr" | `B -> "b" 293 + | `Bdi -> "bdi" | `Bdo -> "bdo" | `Br -> "br" 294 + | `Cite -> "cite" | `Code -> "code" | `Data -> "data" 295 + | `Dfn -> "dfn" | `Em -> "em" | `I -> "i" 296 + | `Kbd -> "kbd" | `Mark -> "mark" | `Q -> "q" 297 + | `Rp -> "rp" | `Rt -> "rt" | `Ruby -> "ruby" 298 + | `S -> "s" | `Samp -> "samp" | `Small -> "small" 299 + | `Span -> "span" | `Strong -> "strong" | `Sub -> "sub" 300 + | `Sup -> "sup" | `Time -> "time" | `U -> "u" 301 + | `Var -> "var" | `Wbr -> "wbr" 302 + (* Edits *) 303 + | `Del -> "del" | `Ins -> "ins" 304 + (* Embedded content *) 305 + | `Area -> "area" | `Audio -> "audio" | `Canvas -> "canvas" 306 + | `Embed -> "embed" | `Iframe -> "iframe" | `Img -> "img" 307 + | `Map -> "map" | `Object -> "object" | `Picture -> "picture" 308 + | `Source -> "source" | `Track -> "track" | `Video -> "video" 309 + (* Tabular data *) 310 + | `Caption -> "caption" | `Col -> "col" | `Colgroup -> "colgroup" 311 + | `Table -> "table" | `Tbody -> "tbody" | `Td -> "td" 312 + | `Tfoot -> "tfoot" | `Th -> "th" | `Thead -> "thead" | `Tr -> "tr" 313 + (* Forms *) 314 + | `Button -> "button" | `Datalist -> "datalist" 315 + | `Fieldset -> "fieldset" | `Form -> "form" | `Input -> "input" 316 + | `Label -> "label" | `Legend -> "legend" | `Meter -> "meter" 317 + | `Optgroup -> "optgroup" | `Option -> "option" 318 + | `Output -> "output" | `Progress -> "progress" 319 + | `Select -> "select" | `Textarea -> "textarea" 320 + (* Interactive *) 321 + | `Details -> "details" | `Dialog -> "dialog" | `Summary -> "summary" 322 + (* Scripting *) 323 + | `Noscript -> "noscript" | `Script -> "script" 324 + | `Slot -> "slot" | `Template -> "template" 325 + (* Web Components / Misc *) 326 + | `Portal -> "portal" | `Param -> "param" 327 + (* Deprecated/obsolete elements *) 328 + | `Applet -> "applet" | `Acronym -> "acronym" | `Bgsound -> "bgsound" 329 + | `Dir -> "dir" | `Frame -> "frame" | `Frameset -> "frameset" 330 + | `Noframes -> "noframes" | `Isindex -> "isindex" | `Keygen -> "keygen" 331 + | `Listing -> "listing" | `Menuitem -> "menuitem" | `Nextid -> "nextid" 332 + | `Noembed -> "noembed" | `Plaintext -> "plaintext" 333 + | `Rb -> "rb" | `Rtc -> "rtc" | `Strike -> "strike" | `Xmp -> "xmp" 334 + | `Basefont -> "basefont" | `Big -> "big" | `Blink -> "blink" 335 + | `Center -> "center" | `Font -> "font" | `Marquee -> "marquee" 336 + | `Multicol -> "multicol" | `Nobr -> "nobr" | `Spacer -> "spacer" 337 + | `Tt -> "tt" | `Image -> "image" 338 + 339 + (** Convert element_tag to string *) 340 + let tag_to_string = function 341 + | Html tag -> html_tag_to_string tag 342 + | Svg name -> name 343 + | MathML name -> name 344 + | Custom name -> name 345 + | Unknown name -> name 346 + 347 + (** {1 Category Predicates} *) 348 + 349 + (** Check if element is a void element *) 350 + let is_void (tag : html_tag) : bool = 351 + match tag with 352 + | `Area | `Base | `Br | `Col | `Embed | `Hr | `Img | `Input 353 + | `Link | `Meta | `Source | `Track | `Wbr 354 + | `Basefont | `Frame | `Isindex | `Keygen | `Param -> true 355 + | _ -> false 356 + 357 + (** Check if element is a heading *) 358 + let is_heading (tag : html_tag) : bool = 359 + match tag with 360 + | `H1 | `H2 | `H3 | `H4 | `H5 | `H6 -> true 361 + | _ -> false 362 + 363 + (** Get heading level (1-6) or None *) 364 + let heading_level (tag : html_tag) : int option = 365 + match tag with 366 + | `H1 -> Some 1 | `H2 -> Some 2 | `H3 -> Some 3 367 + | `H4 -> Some 4 | `H5 -> Some 5 | `H6 -> Some 6 368 + | _ -> None 369 + 370 + (** Check if element is sectioning content *) 371 + let is_sectioning (tag : html_tag) : bool = 372 + match tag with 373 + | `Article | `Aside | `Nav | `Section -> true 374 + | _ -> false 375 + 376 + (** Check if element is a sectioning root *) 377 + let is_sectioning_root (tag : html_tag) : bool = 378 + match tag with 379 + | `Blockquote | `Body | `Details | `Dialog | `Fieldset | `Figure | `Td -> true 380 + | _ -> false 381 + 382 + (** Check if element is embedded content *) 383 + let is_embedded (tag : html_tag) : bool = 384 + match tag with 385 + | `Audio | `Canvas | `Embed | `Iframe | `Img | `Object | `Picture | `Video -> true 386 + | _ -> false 387 + 388 + (** Check if element is interactive content *) 389 + let is_interactive (tag : html_tag) : bool = 390 + match tag with 391 + | `A | `Audio | `Button | `Details | `Embed | `Iframe | `Img 392 + | `Input | `Label | `Select | `Textarea | `Video -> true 393 + | _ -> false 394 + 395 + (** Check if element is form-associated *) 396 + let is_form_associated (tag : html_tag) : bool = 397 + match tag with 398 + | `Button | `Fieldset | `Input | `Label | `Object | `Output 399 + | `Select | `Textarea | `Meter | `Progress -> true 400 + | _ -> false 401 + 402 + (** Check if element is labelable *) 403 + let is_labelable (tag : html_tag) : bool = 404 + match tag with 405 + | `Button | `Input | `Meter | `Output | `Progress | `Select | `Textarea -> true 406 + | _ -> false 407 + 408 + (** Check if element is submittable *) 409 + let is_submittable (tag : html_tag) : bool = 410 + match tag with 411 + | `Button | `Input | `Select | `Textarea -> true 412 + | _ -> false 413 + 414 + (** Check if element is resettable *) 415 + let is_resettable (tag : html_tag) : bool = 416 + match tag with 417 + | `Input | `Output | `Select | `Textarea -> true 418 + | _ -> false 419 + 420 + (** Check if element has transparent content model *) 421 + let is_transparent (tag : html_tag) : bool = 422 + match tag with 423 + | `A | `Abbr | `Audio | `Canvas | `Del | `Ins | `Map | `Noscript 424 + | `Object | `Slot | `Video -> true 425 + | _ -> false 426 + 427 + (** Check if element is script-supporting *) 428 + let is_script_supporting (tag : html_tag) : bool = 429 + match tag with 430 + | `Script | `Template -> true 431 + | _ -> false 432 + 433 + (** Check if element is a table element *) 434 + let is_table_element (tag : html_tag) : bool = 435 + match tag with 436 + | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot 437 + | `Th | `Thead | `Tr -> true 438 + | _ -> false 439 + 440 + (** Check if element is a media element *) 441 + let is_media (tag : html_tag) : bool = 442 + match tag with 443 + | `Audio | `Video -> true 444 + | _ -> false 445 + 446 + (** Check if element is a list container *) 447 + let is_list_container (tag : html_tag) : bool = 448 + match tag with 449 + | `Ul | `Ol | `Menu | `Dl -> true 450 + | _ -> false 451 + 452 + (** Check if element is a list item *) 453 + let is_list_item (tag : html_tag) : bool = 454 + match tag with 455 + | `Li | `Dd | `Dt -> true 456 + | _ -> false 457 + 458 + (** Check if element is metadata content *) 459 + let is_metadata (tag : html_tag) : bool = 460 + match tag with 461 + | `Base | `Link | `Meta | `Noscript | `Script | `Style | `Template | `Title -> true 462 + | _ -> false 463 + 464 + (** Check if element is a deprecated/obsolete element *) 465 + let is_obsolete (tag : html_tag) : bool = 466 + match tag with 467 + | `Applet | `Acronym | `Bgsound | `Dir | `Frame | `Frameset 468 + | `Noframes | `Isindex | `Keygen | `Listing | `Menuitem | `Nextid 469 + | `Noembed | `Plaintext | `Rb | `Rtc | `Strike | `Xmp 470 + | `Basefont | `Big | `Blink | `Center | `Font | `Marquee 471 + | `Multicol | `Nobr | `Spacer | `Tt | `Image -> true 472 + | _ -> false 473 + 474 + (** Check if element is a raw text element (script, style) *) 475 + let is_raw_text (tag : html_tag) : bool = 476 + match tag with 477 + | `Script | `Style -> true 478 + | _ -> false 479 + 480 + (** Check if element is an escapable raw text element (textarea, title) *) 481 + let is_escapable_raw_text (tag : html_tag) : bool = 482 + match tag with 483 + | `Textarea | `Title -> true 484 + | _ -> false 485 + 486 + (** Check if element is a phrasing content element *) 487 + let is_phrasing (tag : html_tag) : bool = 488 + match tag with 489 + | `A | `Abbr | `Audio | `B | `Bdi | `Bdo | `Br | `Button | `Canvas 490 + | `Cite | `Code | `Data | `Datalist | `Del | `Dfn | `Em | `Embed 491 + | `I | `Iframe | `Img | `Input | `Ins | `Kbd | `Label | `Map | `Mark 492 + | `Meter | `Noscript | `Object | `Output | `Picture | `Progress | `Q 493 + | `Ruby | `S | `Samp | `Script | `Select | `Slot | `Small | `Span 494 + | `Strong | `Sub | `Sup | `Template | `Textarea | `Time | `U | `Var 495 + | `Video | `Wbr 496 + (* Deprecated phrasing *) 497 + | `Acronym | `Big | `Blink | `Font | `Marquee | `Nobr | `Spacer | `Tt -> true 498 + | _ -> false 499 + 500 + (** Check if element is flow content *) 501 + let is_flow (tag : html_tag) : bool = 502 + match tag with 503 + (* Most elements are flow content *) 504 + | `Html | `Head | `Title | `Base | `Link | `Meta | `Style -> false 505 + | `Body -> false 506 + | `Caption | `Col | `Colgroup | `Tbody | `Td | `Tfoot | `Th | `Thead | `Tr -> false 507 + | `Dd | `Dt | `Li -> false 508 + | `Optgroup | `Option -> false 509 + | `Param | `Source | `Track -> false 510 + | `Area -> false (* Only when descendant of map *) 511 + | `Rp | `Rt | `Rb | `Rtc -> false 512 + | `Legend | `Figcaption | `Summary -> false 513 + | _ -> true 514 + 515 + (** Pattern for matching HTML tags in element_tag *) 516 + let as_html_tag = function 517 + | Html tag -> Some tag 518 + | _ -> None 519 + 520 + (** Pattern for matching specific HTML tag *) 521 + let is_html_tag expected = function 522 + | Html tag -> tag = expected 523 + | _ -> false
+36 -31
lib/htmlrw_check/semantic/autofocus_checker.ml
··· 25 25 state.context_stack <- []; 26 26 state.current_depth <- 0 27 27 28 - let start_element state ~name ~namespace ~attrs collector = 28 + let start_element state ~element collector = 29 29 state.current_depth <- state.current_depth + 1; 30 30 31 - match namespace with 32 - | Some _ -> () 33 - | None -> 34 - let name_lower = String.lowercase_ascii name in 35 - 36 - (* Check if we're entering a dialog or popover context *) 37 - let enters_context = match name_lower with 38 - | "dialog" -> Some Dialog 39 - | _ when Attr_utils.has_attr "popover" attrs -> Some Popover 40 - | _ -> None 41 - in 42 - 43 - Option.iter (fun ctx_type -> 44 - let ctx = { context_type = ctx_type; autofocus_count = 0; depth = state.current_depth } in 31 + match element.Element.tag with 32 + | Tag.Html `Dialog -> 33 + let ctx = { context_type = Dialog; autofocus_count = 0; depth = state.current_depth } in 34 + state.context_stack <- ctx :: state.context_stack; 35 + (* Check for autofocus on dialog itself *) 36 + if Attr.has_autofocus element.attrs then 37 + begin match state.context_stack with 38 + | ctx :: _ -> 39 + ctx.autofocus_count <- ctx.autofocus_count + 1; 40 + if ctx.autofocus_count > 1 then 41 + Message_collector.add_typed collector (`Misc `Multiple_autofocus) 42 + | [] -> () 43 + end 44 + | Tag.Html _ -> 45 + (* Check if element has popover attribute *) 46 + let has_popover = Attr_utils.has_attr "popover" element.raw_attrs in 47 + if has_popover then begin 48 + let ctx = { context_type = Popover; autofocus_count = 0; depth = state.current_depth } in 45 49 state.context_stack <- ctx :: state.context_stack 46 - ) enters_context; 47 - 50 + end; 48 51 (* Check for autofocus attribute *) 49 - if Attr_utils.has_attr "autofocus" attrs then 52 + if Attr.has_autofocus element.attrs then begin 50 53 match state.context_stack with 51 54 | ctx :: _ -> 52 55 ctx.autofocus_count <- ctx.autofocus_count + 1; 53 56 if ctx.autofocus_count > 1 then 54 57 Message_collector.add_typed collector (`Misc `Multiple_autofocus) 55 58 | [] -> () 59 + end 60 + | _ -> () 56 61 57 - let end_element state ~name ~namespace _collector = 58 - (match namespace with 59 - | Some _ -> () 60 - | None -> 61 - let name_lower = String.lowercase_ascii name in 62 - match state.context_stack with 63 - | ctx :: rest when ctx.depth = state.current_depth -> 64 - let matches = 65 - (name_lower = "dialog" && ctx.context_type = Dialog) || 66 - (ctx.context_type = Popover) 67 - in 68 - if matches then state.context_stack <- rest 69 - | _ -> ()); 62 + let end_element state ~tag _collector = 63 + (match tag with 64 + | Tag.Html `Dialog -> 65 + (match state.context_stack with 66 + | ctx :: rest when ctx.depth = state.current_depth && ctx.context_type = Dialog -> 67 + state.context_stack <- rest 68 + | _ -> ()) 69 + | Tag.Html _ -> 70 + (match state.context_stack with 71 + | ctx :: rest when ctx.depth = state.current_depth && ctx.context_type = Popover -> 72 + state.context_stack <- rest 73 + | _ -> ()) 74 + | _ -> ()); 70 75 71 76 state.current_depth <- state.current_depth - 1 72 77
+6 -5
lib/htmlrw_check/semantic/form_checker.ml
··· 31 31 (`Attr (`Bad_value (`Elem element_name, `Attr "autocomplete", `Value value, `Reason reason))) 32 32 end 33 33 34 - let start_element _state ~name ~namespace:_ ~attrs collector = 34 + let start_element _state ~element collector = 35 35 (* Check autocomplete attribute on form elements *) 36 - match name with 37 - | "input" | "select" | "textarea" -> 38 - (match Attr_utils.get_attr "autocomplete" attrs with 36 + match element.Element.tag with 37 + | Tag.Html (`Input | `Select | `Textarea as tag) -> 38 + let name = Tag.html_tag_to_string tag in 39 + (match Attr_utils.get_attr "autocomplete" element.raw_attrs with 39 40 | Some autocomplete_value -> 40 41 check_autocomplete_value autocomplete_value name collector 41 42 | None -> ()) 42 43 | _ -> () 43 44 44 - let end_element _state ~name:_ ~namespace:_ _collector = () 45 + let end_element _state ~tag:_ _collector = () 45 46 46 47 let characters _state _text _collector = () 47 48
+12 -12
lib/htmlrw_check/semantic/id_checker.ml
··· 176 176 | _ -> () 177 177 ) attrs 178 178 179 - let start_element state ~name ~namespace:_ ~attrs collector = 180 - (* For now, we don't have location information from the DOM walker, 181 - so we pass None. In a full implementation, this would be passed 182 - from the parser. *) 179 + let start_element state ~element collector = 180 + let name = Tag.tag_to_string element.Element.tag in 181 + let attrs = element.raw_attrs in 183 182 let location = None in 184 183 process_attrs state ~element:name ~attrs ~location collector; 185 184 186 185 (* Special check: map element must have matching id and name if both present *) 187 - if name = "map" then begin 188 - let id_opt = List.find_map (fun (n, v) -> if n = "id" then Some v else None) attrs in 189 - let name_opt = List.find_map (fun (n, v) -> if n = "name" then Some v else None) attrs in 190 - match id_opt, name_opt with 191 - | Some id_val, Some name_val when id_val <> name_val -> 186 + (match element.tag with 187 + | Tag.Html `Map -> 188 + let id_opt = Attr.get_id element.attrs in 189 + let name_opt = Attr.get_name element.attrs in 190 + (match id_opt, name_opt with 191 + | Some id_val, Some name_val when id_val <> name_val -> 192 192 Message_collector.add_typed collector (`Misc `Map_id_name_mismatch) 193 - | _ -> () 194 - end 193 + | _ -> ()) 194 + | _ -> ()) 195 195 196 - let end_element _state ~name:_ ~namespace:_ _collector = 196 + let end_element _state ~tag:_ _collector = 197 197 () 198 198 199 199 let characters _state _text _collector =
+20 -17
lib/htmlrw_check/semantic/lang_detecting_checker.ml
··· 216 216 (* If > 2% are Traditional-only characters, it's Traditional Chinese *) 217 217 !total > 100 && (float_of_int !count /. float_of_int !total) > 0.02 218 218 219 - let start_element state ~name ~namespace ~attrs _collector = 220 - let name_lower = String.lowercase_ascii name in 221 - let ns = Option.value namespace ~default:"" in 222 - 223 - if name_lower = "html" then begin 219 + let start_element state ~element _collector = 220 + let attrs = element.Element.raw_attrs in 221 + match element.tag with 222 + | Tag.Html `Html -> 224 223 state.html_lang <- Attr_utils.get_attr "lang" attrs; 225 224 state.html_dir <- Attr_utils.get_attr "dir" attrs; 226 225 (* TODO: get line/column from locator *) 227 226 state.html_locator <- Some (1, 1) 228 - end 229 - else if name_lower = "body" then 227 + | Tag.Html `Body -> 230 228 state.in_body <- true 231 - else if state.in_body then begin 232 - (* Track foreign namespace depth (SVG/MathML) *) 233 - if is_foreign_namespace ns || is_foreign_element name then 229 + | Tag.Svg _ | Tag.MathML _ -> 230 + if state.in_body then 234 231 state.foreign_depth <- state.foreign_depth + 1 235 - else if state.foreign_depth > 0 then 232 + | Tag.Html tag when state.in_body -> 233 + let name_lower = Tag.html_tag_to_string tag in 234 + if state.foreign_depth > 0 then 236 235 state.foreign_depth <- state.foreign_depth + 1 237 236 (* Check if we should skip this element's text *) 238 237 else if List.mem name_lower skip_elements then ··· 244 243 state.skip_depth <- state.skip_depth + 1 245 244 | _ -> () 246 245 end 247 - end 246 + | _ -> () 248 247 249 - let end_element state ~name ~namespace:_ _collector = 250 - let name_lower = String.lowercase_ascii name in 251 - if name_lower = "body" then 248 + let end_element state ~tag _collector = 249 + match tag with 250 + | Tag.Html `Body -> 252 251 state.in_body <- false 253 - else if state.in_body then begin 252 + | Tag.Svg _ | Tag.MathML _ when state.in_body -> 253 + if state.foreign_depth > 0 then 254 + state.foreign_depth <- state.foreign_depth - 1 255 + | Tag.Html tag when state.in_body -> 256 + let name_lower = Tag.html_tag_to_string tag in 254 257 (* Track foreign namespace depth *) 255 258 if state.foreign_depth > 0 then 256 259 state.foreign_depth <- state.foreign_depth - 1 ··· 261 264 (* TODO: properly track nested elements with different lang *) 262 265 state.skip_depth <- max 0 (state.skip_depth - 1) 263 266 end 264 - end 267 + | _ -> () 265 268 266 269 let characters state text _collector = 267 270 if state.in_body && state.skip_depth = 0 && state.foreign_depth = 0 && state.char_count < max_chars then begin
+10 -8
lib/htmlrw_check/semantic/nesting_checker.ml
··· 300 300 end 301 301 | _ -> () 302 302 303 - let start_element state ~name ~namespace ~attrs collector = 303 + let start_element state ~element collector = 304 304 (* Only check HTML elements, not SVG or MathML *) 305 - match namespace with 306 - | Some _ -> () 307 - | None -> 305 + match element.Element.tag with 306 + | Tag.Html _ -> 307 + let name = Tag.tag_to_string element.tag in 308 + let attrs = element.raw_attrs in 308 309 (* Check for nesting violations *) 309 310 check_nesting state name attrs collector; 310 311 check_required_ancestors state name collector; ··· 334 335 let node = { ancestor_mask = state.ancestor_mask; name; is_transparent } in 335 336 state.stack <- node :: state.stack; 336 337 state.ancestor_mask <- new_mask 338 + | _ -> () (* SVG, MathML, Custom, Unknown *) 337 339 338 - let end_element state ~name:_ ~namespace _collector = 340 + let end_element state ~tag _collector = 339 341 (* Only track HTML elements *) 340 - match namespace with 341 - | Some _ -> () 342 - | None -> 342 + match tag with 343 + | Tag.Html _ -> 343 344 (* Pop from stack and restore ancestor mask *) 344 345 begin match state.stack with 345 346 | [] -> () (* Should not happen in well-formed documents *) ··· 347 348 state.stack <- rest; 348 349 state.ancestor_mask <- node.ancestor_mask 349 350 end 351 + | _ -> () 350 352 351 353 let characters _state _text _collector = 352 354 () (* No text-specific nesting checks *)
+11 -13
lib/htmlrw_check/semantic/obsolete_checker.ml
··· 250 250 251 251 let reset state = state.in_head <- false 252 252 253 - let start_element state ~name ~namespace ~attrs collector = 254 - (* Only check HTML elements (no namespace or explicit HTML namespace) *) 255 - let is_html = match namespace with 256 - | None -> true 257 - | Some ns -> String.equal (String.lowercase_ascii ns) "html" 258 - in 259 - 260 - if not is_html then () 261 - else begin 253 + let start_element state ~element collector = 254 + (* Only check HTML elements *) 255 + match element.Element.tag with 256 + | Tag.Html _ -> 257 + let name = Tag.tag_to_string element.tag in 262 258 let name_lower = String.lowercase_ascii name in 259 + let attrs = element.raw_attrs in 263 260 264 261 (* Track head context *) 265 262 if name_lower = "head" then state.in_head <- true; ··· 309 306 (`Element (`Obsolete_global_attr (`Attr attr_name, `Suggestion suggestion)))) 310 307 end 311 308 ) attrs 312 - end 309 + | _ -> () (* Non-HTML elements don't have obsolete checks *) 313 310 314 - let end_element state ~name ~namespace:_ _collector = 315 - let name_lower = String.lowercase_ascii name in 316 - if name_lower = "head" then state.in_head <- false 311 + let end_element state ~tag _collector = 312 + match tag with 313 + | Tag.Html `Head -> state.in_head <- false 314 + | _ -> () 317 315 318 316 let characters _state _text _collector = () 319 317
+30 -41
lib/htmlrw_check/semantic/option_checker.ml
··· 22 22 state.option_stack <- []; 23 23 state.in_template <- 0 24 24 25 - let start_element state ~name ~namespace ~attrs collector = 26 - let name_lower = String.lowercase_ascii name in 25 + let start_element state ~element collector = 26 + match element.Element.tag with 27 + | Tag.Html `Template -> 28 + state.in_template <- state.in_template + 1 29 + | Tag.Html `Option when state.in_template = 0 -> 30 + let label_opt = Attr_utils.get_attr "label" element.raw_attrs in 31 + let has_label = label_opt <> None in 32 + let label_empty = match label_opt with 33 + | Some v -> String.trim v = "" 34 + | None -> false 35 + in 36 + (* Report error for empty label attribute value *) 37 + if label_empty then 38 + Message_collector.add_typed collector 39 + (`Attr (`Bad_value (`Elem "option", `Attr "label", `Value "", `Reason "Bad non-empty string: Must not be empty."))); 40 + let ctx = { has_text = false; has_label; label_empty } in 41 + state.option_stack <- ctx :: state.option_stack 42 + | _ -> () 27 43 28 - if namespace <> None then () 29 - else begin 30 - if name_lower = "template" then 31 - state.in_template <- state.in_template + 1 32 - else if state.in_template = 0 && name_lower = "option" then begin 33 - let label_opt = Attr_utils.get_attr "label" attrs in 34 - let has_label = label_opt <> None in 35 - let label_empty = match label_opt with 36 - | Some v -> String.trim v = "" 37 - | None -> false 38 - in 39 - (* Report error for empty label attribute value *) 40 - if label_empty then 41 - Message_collector.add_typed collector 42 - (`Attr (`Bad_value (`Elem "option", `Attr "label", `Value "", `Reason "Bad non-empty string: Must not be empty."))); 43 - let ctx = { has_text = false; has_label; label_empty } in 44 - state.option_stack <- ctx :: state.option_stack 45 - end 46 - end 47 - 48 - let end_element state ~name ~namespace collector = 49 - let name_lower = String.lowercase_ascii name in 50 - 51 - if namespace <> None then () 52 - else begin 53 - if name_lower = "template" then 54 - state.in_template <- max 0 (state.in_template - 1) 55 - else if state.in_template = 0 && name_lower = "option" then begin 56 - match state.option_stack with 57 - | ctx :: rest -> 58 - state.option_stack <- rest; 59 - (* Validate: option must have text content or non-empty label *) 60 - (* Note: empty label error is already reported at start_element, 61 - so only report empty option without label when there's no label attribute at all *) 62 - if not ctx.has_text && not ctx.has_label then 63 - Message_collector.add_typed collector (`Misc `Option_empty_without_label) 64 - | [] -> () 65 - end 66 - end 44 + let end_element state ~tag collector = 45 + match tag with 46 + | Tag.Html `Template -> 47 + state.in_template <- max 0 (state.in_template - 1) 48 + | Tag.Html `Option when state.in_template = 0 -> 49 + (match state.option_stack with 50 + | ctx :: rest -> 51 + state.option_stack <- rest; 52 + if not ctx.has_text && not ctx.has_label then 53 + Message_collector.add_typed collector (`Misc `Option_empty_without_label) 54 + | [] -> ()) 55 + | _ -> () 67 56 68 57 let characters state text _collector = 69 58 if state.in_template = 0 then begin
+23 -20
lib/htmlrw_check/semantic/required_attr_checker.ml
··· 177 177 (q "value") (q "max"))) 178 178 with _ -> ()) 179 179 180 - let start_element state ~name ~namespace:_ ~attrs collector = 181 - match name with 182 - | "img" -> check_img_element state attrs collector 183 - | "area" -> check_area_element attrs collector 184 - | "input" -> check_input_element attrs collector 185 - | "script" -> check_script_element attrs collector 186 - | "meta" -> check_meta_element attrs collector 187 - | "link" -> check_link_element attrs collector 188 - | "a" -> 180 + let start_element state ~element collector = 181 + let attrs = element.Element.raw_attrs in 182 + match element.tag with 183 + | Tag.Html `Img -> check_img_element state attrs collector 184 + | Tag.Html `Area -> check_area_element attrs collector 185 + | Tag.Html `Input -> check_input_element attrs collector 186 + | Tag.Html `Script -> check_script_element attrs collector 187 + | Tag.Html `Meta -> check_meta_element attrs collector 188 + | Tag.Html `Link -> check_link_element attrs collector 189 + | Tag.Html `A -> 189 190 check_a_element attrs collector; 190 191 if Attr_utils.has_attr "href" attrs then state.in_a_with_href <- true 191 - | "map" -> check_map_element attrs collector 192 - | "object" -> check_object_element attrs collector 193 - | "meter" -> check_meter_element attrs collector 194 - | "progress" -> check_progress_element attrs collector 195 - | "figure" -> state._in_figure <- true 196 - | _ -> 197 - (* Check popover attribute on any element *) 192 + | Tag.Html `Map -> check_map_element attrs collector 193 + | Tag.Html `Object -> check_object_element attrs collector 194 + | Tag.Html `Meter -> check_meter_element attrs collector 195 + | Tag.Html `Progress -> check_progress_element attrs collector 196 + | Tag.Html `Figure -> state._in_figure <- true 197 + | Tag.Html _ -> 198 + (* Check popover attribute on any HTML element *) 199 + let name = Tag.tag_to_string element.tag in 198 200 if Attr_utils.has_attr "popover" attrs then check_popover_element name attrs collector 201 + | _ -> () (* Non-HTML elements *) 199 202 200 - let end_element state ~name ~namespace:_ _collector = 201 - match name with 202 - | "figure" -> state._in_figure <- false 203 - | "a" -> state.in_a_with_href <- false 203 + let end_element state ~tag _collector = 204 + match tag with 205 + | Tag.Html `Figure -> state._in_figure <- false 206 + | Tag.Html `A -> state.in_a_with_href <- false 204 207 | _ -> () 205 208 206 209 let characters _state _text _collector = ()
+13 -11
lib/htmlrw_check/specialized/aria_checker.ml
··· 427 427 let quoted = List.map (fun r -> "\"" ^ r ^ "\"") roles in 428 428 String.concat " or " quoted 429 429 430 - let start_element state ~name ~namespace ~attrs collector = 430 + let start_element state ~element collector = 431 431 (* Only process HTML elements *) 432 - match namespace with 433 - | Some _ -> () (* Skip non-HTML elements *) 434 - | None -> 432 + match element.Element.tag with 433 + | Tag.Html _ -> 434 + let name = Tag.tag_to_string element.tag in 435 435 let name_lower = String.lowercase_ascii name in 436 + let attrs = element.raw_attrs in 436 437 let role_attr = List.assoc_opt "role" attrs in 437 438 let aria_label = List.assoc_opt "aria-label" attrs in 438 439 let aria_labelledby = List.assoc_opt "aria-labelledby" attrs in ··· 723 724 implicit_role; 724 725 } in 725 726 state.stack <- node :: state.stack 727 + | _ -> () (* Skip non-HTML elements *) 726 728 727 - let end_element state ~name:_ ~namespace _collector = 729 + let end_element state ~tag _collector = 728 730 (* Only process HTML elements *) 729 - match namespace with 730 - | Some _ -> () (* Skip non-HTML elements *) 731 - | None -> 731 + match tag with 732 + | Tag.Html _ -> 732 733 (* Pop from stack *) 733 - match state.stack with 734 - | _ :: rest -> state.stack <- rest 735 - | [] -> () (* Stack underflow - shouldn't happen in well-formed docs *) 734 + (match state.stack with 735 + | _ :: rest -> state.stack <- rest 736 + | [] -> ()) (* Stack underflow - shouldn't happen in well-formed docs *) 737 + | _ -> () 736 738 737 739 let characters _state _text _collector = () 738 740
+255 -299
lib/htmlrw_check/specialized/attr_restrictions_checker.ml
··· 52 52 Message_collector.add_typed collector 53 53 (`Attr (`Not_allowed (`Attr attr, `Elem element))) 54 54 55 - let start_element state ~name ~namespace ~attrs collector = 56 - let name_lower = String.lowercase_ascii name in 55 + let start_element state ~element collector = 56 + match element.Element.tag with 57 + | Tag.Html _ -> 58 + let name = Tag.tag_to_string element.tag in 59 + let name_lower = String.lowercase_ascii name in 60 + let attrs = element.raw_attrs in 57 61 58 - (* Detect XHTML mode from xmlns attribute on html element *) 59 - if name_lower = "html" then begin 60 - match Attr_utils.get_attr "xmlns" attrs with 61 - | Some "http://www.w3.org/1999/xhtml" -> state.is_xhtml <- true 62 - | _ -> () 63 - end; 62 + (* Detect XHTML mode from xmlns attribute on html element *) 63 + if name_lower = "html" then begin 64 + match Attr_utils.get_attr "xmlns" attrs with 65 + | Some "http://www.w3.org/1999/xhtml" -> state.is_xhtml <- true 66 + | _ -> () 67 + end; 64 68 65 - (* Check HTML element attribute restrictions *) 66 - (match namespace with 67 - | Some _ -> () 68 - | None -> 69 - match List.assoc_opt name_lower disallowed_attrs_html with 69 + (* Check HTML element attribute restrictions *) 70 + (match List.assoc_opt name_lower disallowed_attrs_html with 70 71 | Some disallowed -> 71 72 List.iter (fun attr -> 72 73 if Attr_utils.has_attr attr attrs then ··· 74 75 ) disallowed 75 76 | None -> ()); 76 77 77 - (* Check for xml:base attribute - not allowed in HTML *) 78 - (match namespace with 79 - | Some _ -> () 80 - | None when name_lower = "html" -> 81 - if Attr_utils.has_attr "xml:base" attrs then 82 - report_disallowed_attr name_lower "xml:base" collector 83 - | None -> ()); 78 + (* Check for xml:base attribute - not allowed in HTML *) 79 + if name_lower = "html" then begin 80 + if Attr_utils.has_attr "xml:base" attrs then 81 + report_disallowed_attr name_lower "xml:base" collector 82 + end; 84 83 85 - (* Check for xmlns:* prefixed attributes - not allowed in HTML *) 86 - (* Standard xmlns declarations are allowed but custom prefixes are not *) 87 - (match namespace with 88 - | Some _ -> () 89 - | None -> 84 + (* Check for xmlns:* prefixed attributes - not allowed in HTML *) 85 + (* Standard xmlns declarations are allowed but custom prefixes are not *) 90 86 List.iter (fun (attr_name, _) -> 91 87 let attr_lower = String.lowercase_ascii attr_name in 92 88 if String.starts_with ~prefix:"xmlns:" attr_lower then begin ··· 96 92 Message_collector.add_typed collector 97 93 (`Attr (`Not_allowed_here (`Attr attr_name))) 98 94 end 99 - ) attrs); 95 + ) attrs; 96 + 97 + (* Check SVG element restrictions - works in both HTML-embedded and XHTML SVG *) 98 + (* xml:id is never valid on SVG elements in HTML5 *) 99 + if List.mem name_lower svg_no_xml_id then begin 100 + if Attr_utils.has_attr "xml:id" attrs then 101 + report_disallowed_attr name_lower "xml:id" collector 102 + end; 100 103 101 - (* Check SVG element restrictions - works in both HTML-embedded and XHTML SVG *) 102 - (* xml:id is never valid on SVG elements in HTML5 *) 103 - if List.mem name_lower svg_no_xml_id then begin 104 - if Attr_utils.has_attr "xml:id" attrs then 105 - report_disallowed_attr name_lower "xml:id" collector 106 - end; 104 + (* SVG feConvolveMatrix requires order attribute *) 105 + if name_lower = "feconvolvematrix" then begin 106 + if not (Attr_utils.has_attr "order" attrs) then 107 + Message_collector.add_typed collector 108 + (`Svg (`Missing_attr (`Elem "feConvolveMatrix", `Attr "order"))) 109 + end; 107 110 108 - (* SVG feConvolveMatrix requires order attribute *) 109 - if name_lower = "feconvolvematrix" then begin 110 - if not (Attr_utils.has_attr "order" attrs) then 111 - Message_collector.add_typed collector 112 - (`Svg (`Missing_attr (`Elem "feConvolveMatrix", `Attr "order"))) 113 - end; 111 + (* Validate style type attribute - must be "text/css" or omitted *) 112 + if name_lower = "style" then begin 113 + List.iter (fun (attr_name, attr_value) -> 114 + let attr_lower = String.lowercase_ascii attr_name in 115 + if attr_lower = "type" then begin 116 + let value_lower = String.lowercase_ascii (String.trim attr_value) in 117 + if value_lower <> "text/css" then 118 + Message_collector.add_typed collector (`Misc `Style_type_invalid) 119 + end 120 + ) attrs 121 + end; 114 122 115 - (* Validate style type attribute - must be "text/css" or omitted *) 116 - (match namespace with 117 - | Some _ -> () 118 - | None when name_lower = "style" -> 119 - List.iter (fun (attr_name, attr_value) -> 120 - let attr_lower = String.lowercase_ascii attr_name in 121 - if attr_lower = "type" then begin 122 - let value_lower = String.lowercase_ascii (String.trim attr_value) in 123 - if value_lower <> "text/css" then 124 - Message_collector.add_typed collector (`Misc `Style_type_invalid) 125 - end 126 - ) attrs 127 - | None -> ()); 123 + (* Validate object element requires data or type attribute *) 124 + if name_lower = "object" then begin 125 + let has_data = Attr_utils.has_attr "data" attrs in 126 + let has_type = Attr_utils.has_attr "type" attrs in 127 + if not has_data && not has_type then 128 + Message_collector.add_typed collector 129 + (`Attr (`Missing (`Elem "object", `Attr "data"))) 130 + end; 128 131 129 - (* Validate object element requires data or type attribute *) 130 - (match namespace with 131 - | Some _ -> () 132 - | None when name_lower = "object" -> 133 - let has_data = Attr_utils.has_attr "data" attrs in 134 - let has_type = Attr_utils.has_attr "type" attrs in 135 - if not has_data && not has_type then 136 - Message_collector.add_typed collector 137 - (`Attr (`Missing (`Elem "object", `Attr "data"))) 138 - | None -> ()); 132 + (* Validate link imagesizes/imagesrcset attributes *) 133 + if name_lower = "link" then begin 134 + let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in 135 + let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in 136 + let rel_value = Attr_utils.get_attr "rel" attrs in 137 + let as_value = Attr_utils.get_attr "as" attrs in 139 138 140 - (* Validate link imagesizes/imagesrcset attributes *) 141 - (match namespace with 142 - | Some _ -> () 143 - | None when name_lower = "link" -> 144 - let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in 145 - let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in 146 - let rel_value = Attr_utils.get_attr "rel" attrs in 147 - let as_value = Attr_utils.get_attr "as" attrs in 139 + (* imagesizes requires imagesrcset *) 140 + if has_imagesizes && not has_imagesrcset then 141 + Message_collector.add_typed collector (`Srcset `Imagesizes_without_imagesrcset); 148 142 149 - (* imagesizes requires imagesrcset *) 150 - if has_imagesizes && not has_imagesrcset then 151 - Message_collector.add_typed collector (`Srcset `Imagesizes_without_imagesrcset); 143 + (* imagesrcset requires as="image" *) 144 + if has_imagesrcset then begin 145 + let as_is_image = match as_value with 146 + | Some v -> String.lowercase_ascii (String.trim v) = "image" 147 + | None -> false 148 + in 149 + if not as_is_image then 150 + Message_collector.add_typed collector (`Link `Imagesrcset_requires_as_image) 151 + end; 152 152 153 - (* imagesrcset requires as="image" *) 154 - if has_imagesrcset then begin 155 - let as_is_image = match as_value with 156 - | Some v -> String.lowercase_ascii (String.trim v) = "image" 157 - | None -> false 158 - in 159 - if not as_is_image then 160 - Message_collector.add_typed collector (`Link `Imagesrcset_requires_as_image) 153 + (* as attribute requires rel="preload" or rel="modulepreload" *) 154 + (match as_value with 155 + | Some _ -> 156 + let rel_is_preload = match rel_value with 157 + | Some v -> 158 + let rel_lower = String.lowercase_ascii (String.trim v) in 159 + String.length rel_lower > 0 && 160 + (List.mem "preload" (String.split_on_char ' ' rel_lower) || 161 + List.mem "modulepreload" (String.split_on_char ' ' rel_lower)) 162 + | None -> false 163 + in 164 + if not rel_is_preload then 165 + Message_collector.add_typed collector (`Link `As_requires_preload) 166 + | None -> ()) 161 167 end; 162 168 163 - (* as attribute requires rel="preload" or rel="modulepreload" *) 164 - (match as_value with 165 - | Some _ -> 166 - let rel_is_preload = match rel_value with 167 - | Some v -> 168 - let rel_lower = String.lowercase_ascii (String.trim v) in 169 - String.length rel_lower > 0 && 170 - (List.mem "preload" (String.split_on_char ' ' rel_lower) || 171 - List.mem "modulepreload" (String.split_on_char ' ' rel_lower)) 172 - | None -> false 173 - in 174 - if not rel_is_preload then 175 - Message_collector.add_typed collector (`Link `As_requires_preload) 176 - | None -> ()) 177 - | None -> ()); 178 - 179 - (* Validate img usemap attribute - must be hash-name reference with content *) 180 - (match namespace with 181 - | Some _ -> () 182 - | None when name_lower = "img" -> 183 - List.iter (fun (attr_name, attr_value) -> 184 - let attr_lower = String.lowercase_ascii attr_name in 185 - if attr_lower = "usemap" then begin 186 - if attr_value = "#" then 187 - Message_collector.add_typed collector 188 - (`Attr (`Bad_value_generic (`Message (Printf.sprintf 189 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d." 190 - attr_value attr_name name)))) 191 - end 192 - ) attrs 193 - | None -> ()); 169 + (* Validate img usemap attribute - must be hash-name reference with content *) 170 + if name_lower = "img" then begin 171 + List.iter (fun (attr_name, attr_value) -> 172 + let attr_lower = String.lowercase_ascii attr_name in 173 + if attr_lower = "usemap" then begin 174 + if attr_value = "#" then 175 + Message_collector.add_typed collector 176 + (`Attr (`Bad_value_generic (`Message (Printf.sprintf 177 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d." 178 + attr_value attr_name name)))) 179 + end 180 + ) attrs 181 + end; 194 182 195 - (* Validate embed type attribute - must be valid MIME type *) 196 - (match namespace with 197 - | Some _ -> () 198 - | None when name_lower = "embed" -> 199 - List.iter (fun (attr_name, attr_value) -> 200 - let attr_lower = String.lowercase_ascii attr_name in 201 - if attr_lower = "type" then begin 202 - match Dt_mime.validate_mime_type attr_value with 203 - | Ok () -> () 204 - | Error msg -> 205 - Message_collector.add_typed collector 206 - (`Attr (`Bad_value_generic (`Message (Printf.sprintf 207 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s" 208 - attr_value attr_name name msg)))) 209 - end 210 - ) attrs 211 - | None -> ()); 183 + (* Validate embed type attribute - must be valid MIME type *) 184 + if name_lower = "embed" then begin 185 + List.iter (fun (attr_name, attr_value) -> 186 + let attr_lower = String.lowercase_ascii attr_name in 187 + if attr_lower = "type" then begin 188 + match Dt_mime.validate_mime_type attr_value with 189 + | Ok () -> () 190 + | Error msg -> 191 + Message_collector.add_typed collector 192 + (`Attr (`Bad_value_generic (`Message (Printf.sprintf 193 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s" 194 + attr_value attr_name name msg)))) 195 + end 196 + ) attrs 197 + end; 212 198 213 - (* Validate width/height on embed and img - must be non-negative integers *) 214 - let is_dimension_element = name_lower = "embed" || name_lower = "img" || 215 - name_lower = "video" || name_lower = "canvas" || 216 - name_lower = "iframe" || name_lower = "source" in 217 - (match namespace with 218 - | Some _ -> () 219 - | None when is_dimension_element -> 220 - List.iter (fun (attr_name, attr_value) -> 221 - let attr_lower = String.lowercase_ascii attr_name in 222 - if attr_lower = "width" || attr_lower = "height" then begin 223 - (* Check for non-negative integer only *) 224 - let is_valid = 225 - String.length attr_value > 0 && 226 - String.for_all (fun c -> c >= '0' && c <= '9') attr_value 227 - in 228 - if not is_valid then begin 229 - (* Determine specific error message *) 230 - let error_msg = 231 - if String.length attr_value = 0 then 232 - Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer." 233 - attr_name name 234 - else if String.contains attr_value '%' then 235 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead." 236 - attr_value attr_name name 237 - else if String.length attr_value > 0 && attr_value.[0] = '-' then 238 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead." 239 - attr_value attr_name name 240 - else 241 - (* Find first non-digit character *) 242 - let bad_char = 243 - try 244 - let i = ref 0 in 245 - while !i < String.length attr_value && attr_value.[!i] >= '0' && attr_value.[!i] <= '9' do 246 - incr i 247 - done; 248 - if !i < String.length attr_value then Some attr_value.[!i] else None 249 - with _ -> None 250 - in 251 - match bad_char with 252 - | Some c -> 253 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead." 254 - attr_value attr_name name c 255 - | None -> 256 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit." 199 + (* Validate width/height on embed and img - must be non-negative integers *) 200 + let is_dimension_element = name_lower = "embed" || name_lower = "img" || 201 + name_lower = "video" || name_lower = "canvas" || 202 + name_lower = "iframe" || name_lower = "source" in 203 + if is_dimension_element then begin 204 + List.iter (fun (attr_name, attr_value) -> 205 + let attr_lower = String.lowercase_ascii attr_name in 206 + if attr_lower = "width" || attr_lower = "height" then begin 207 + (* Check for non-negative integer only *) 208 + let is_valid = 209 + String.length attr_value > 0 && 210 + String.for_all (fun c -> c >= '0' && c <= '9') attr_value 211 + in 212 + if not is_valid then begin 213 + (* Determine specific error message *) 214 + let error_msg = 215 + if String.length attr_value = 0 then 216 + Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer." 217 + attr_name name 218 + else if String.contains attr_value '%' then 219 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead." 257 220 attr_value attr_name name 258 - in 259 - Message_collector.add_typed collector 260 - (`Attr (`Bad_value_generic (`Message error_msg))) 221 + else if String.length attr_value > 0 && attr_value.[0] = '-' then 222 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead." 223 + attr_value attr_name name 224 + else 225 + (* Find first non-digit character *) 226 + let bad_char = 227 + try 228 + let i = ref 0 in 229 + while !i < String.length attr_value && attr_value.[!i] >= '0' && attr_value.[!i] <= '9' do 230 + incr i 231 + done; 232 + if !i < String.length attr_value then Some attr_value.[!i] else None 233 + with _ -> None 234 + in 235 + match bad_char with 236 + | Some c -> 237 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead." 238 + attr_value attr_name name c 239 + | None -> 240 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit." 241 + attr_value attr_name name 242 + in 243 + Message_collector.add_typed collector 244 + (`Attr (`Bad_value_generic (`Message error_msg))) 245 + end 261 246 end 262 - end 263 - ) attrs 264 - | None -> ()); 247 + ) attrs 248 + end; 265 249 266 - (* Validate area[shape=default] cannot have coords *) 267 - (match namespace with 268 - | Some _ -> () 269 - | None when name_lower = "area" -> 270 - (match Attr_utils.get_attr "shape" attrs with 271 - | Some s when String.lowercase_ascii (String.trim s) = "default" -> 272 - if Attr_utils.has_attr "coords" attrs then 273 - Message_collector.add_typed collector 274 - (`Attr (`Not_allowed (`Attr "coords", `Elem "area"))) 275 - | _ -> ()) 276 - | None -> ()); 250 + (* Validate area[shape=default] cannot have coords *) 251 + if name_lower = "area" then begin 252 + match Attr_utils.get_attr "shape" attrs with 253 + | Some s when String.lowercase_ascii (String.trim s) = "default" -> 254 + if Attr_utils.has_attr "coords" attrs then 255 + Message_collector.add_typed collector 256 + (`Attr (`Not_allowed (`Attr "coords", `Elem "area"))) 257 + | _ -> () 258 + end; 277 259 278 - (* Validate bdo element requires dir attribute, and dir cannot be "auto" *) 279 - (match namespace with 280 - | Some _ -> () 281 - | None when name_lower = "bdo" -> 282 - (match Attr_utils.get_attr "dir" attrs with 283 - | None -> 284 - Message_collector.add_typed collector (`Misc `Bdo_missing_dir) 285 - | Some v when String.lowercase_ascii (String.trim v) = "auto" -> 286 - Message_collector.add_typed collector (`Misc `Bdo_dir_auto) 287 - | _ -> ()) 288 - | None -> ()); 260 + (* Validate bdo element requires dir attribute, and dir cannot be "auto" *) 261 + if name_lower = "bdo" then begin 262 + match Attr_utils.get_attr "dir" attrs with 263 + | None -> 264 + Message_collector.add_typed collector (`Misc `Bdo_missing_dir) 265 + | Some v when String.lowercase_ascii (String.trim v) = "auto" -> 266 + Message_collector.add_typed collector (`Misc `Bdo_dir_auto) 267 + | _ -> () 268 + end; 289 269 290 - (* Validate input list attribute - only allowed for certain types *) 291 - (match namespace with 292 - | Some _ -> () 293 - | None when name_lower = "input" -> 294 - if Attr_utils.has_attr "list" attrs then begin 295 - let input_type = Attr_utils.get_attr_or "type" ~default:"text" attrs 296 - |> String.trim |> String.lowercase_ascii in 297 - if not (List.mem input_type input_types_allowing_list) then 298 - Message_collector.add_typed collector (`Input `List_not_allowed) 299 - end 300 - | None -> ()); 270 + (* Validate input list attribute - only allowed for certain types *) 271 + if name_lower = "input" then begin 272 + if Attr_utils.has_attr "list" attrs then begin 273 + let input_type = Attr_utils.get_attr_or "type" ~default:"text" attrs 274 + |> String.trim |> String.lowercase_ascii in 275 + if not (List.mem input_type input_types_allowing_list) then 276 + Message_collector.add_typed collector (`Input `List_not_allowed) 277 + end 278 + end; 301 279 302 - (* Validate data-* attributes *) 303 - (match namespace with 304 - | Some _ -> () 305 - | None -> 280 + (* Validate data-* attributes *) 306 281 List.iter (fun (attr_name, _) -> 307 282 let attr_lower = String.lowercase_ascii attr_name in 308 283 (* Check if it starts with "data-" *) ··· 316 291 Message_collector.add_typed collector 317 292 (`Attr (`Data_invalid_name (`Reason "must be XML 1.0 4th ed. plus Namespaces NCNames"))) 318 293 end 319 - ) attrs); 294 + ) attrs; 320 295 321 - (* Validate xml:lang must have matching lang attribute - only in HTML mode, not XHTML *) 322 - (match namespace with 323 - | Some _ -> () 324 - | None when not state.is_xhtml -> 325 - let xmllang_value = Attr_utils.get_attr "xml:lang" attrs in 326 - let lang_value = Attr_utils.get_attr "lang" attrs in 327 - (match xmllang_value with 328 - | Some xmllang -> 329 - (match lang_value with 330 - | None -> 331 - Message_collector.add_typed collector (`I18n `Xml_lang_without_lang) 332 - | Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang -> 333 - Message_collector.add_typed collector (`I18n `Xml_lang_without_lang) 334 - | _ -> ()) 335 - | None -> ()) 336 - | None -> ()); 296 + (* Validate xml:lang must have matching lang attribute - only in HTML mode, not XHTML *) 297 + if not state.is_xhtml then begin 298 + let xmllang_value = Attr_utils.get_attr "xml:lang" attrs in 299 + let lang_value = Attr_utils.get_attr "lang" attrs in 300 + match xmllang_value with 301 + | Some xmllang -> 302 + (match lang_value with 303 + | None -> 304 + Message_collector.add_typed collector (`I18n `Xml_lang_without_lang) 305 + | Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang -> 306 + Message_collector.add_typed collector (`I18n `Xml_lang_without_lang) 307 + | _ -> ()) 308 + | None -> () 309 + end; 337 310 338 - (* Validate spellcheck attribute - must be "true" or "false" or empty *) 339 - (match namespace with 340 - | Some _ -> () 341 - | None -> 311 + (* Validate spellcheck attribute - must be "true" or "false" or empty *) 342 312 List.iter (fun (attr_name, attr_value) -> 343 313 let attr_lower = String.lowercase_ascii attr_name in 344 314 if attr_lower = "spellcheck" then begin ··· 347 317 Message_collector.add_typed collector 348 318 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason ""))) 349 319 end 350 - ) attrs); 320 + ) attrs; 351 321 352 - (* Validate enterkeyhint attribute - must be one of specific values *) 353 - (match namespace with 354 - | Some _ -> () 355 - | None -> 322 + (* Validate enterkeyhint attribute - must be one of specific values *) 356 323 let valid_enterkeyhint = ["enter"; "done"; "go"; "next"; "previous"; "search"; "send"] in 357 324 List.iter (fun (attr_name, attr_value) -> 358 325 let attr_lower = String.lowercase_ascii attr_name in ··· 362 329 Message_collector.add_typed collector 363 330 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason ""))) 364 331 end 365 - ) attrs); 332 + ) attrs; 366 333 367 - (* Validate headingoffset attribute - must be a number between 0 and 8 *) 368 - (match namespace with 369 - | Some _ -> () 370 - | None -> 334 + (* Validate headingoffset attribute - must be a number between 0 and 8 *) 371 335 List.iter (fun (attr_name, attr_value) -> 372 336 let attr_lower = String.lowercase_ascii attr_name in 373 337 if attr_lower = "headingoffset" then begin ··· 383 347 if not is_valid then 384 348 Message_collector.add_typed collector (`Misc `Headingoffset_invalid) 385 349 end 386 - ) attrs); 350 + ) attrs; 387 351 388 - (* Validate accesskey attribute - each key label must be a single code point *) 389 - (match namespace with 390 - | Some _ -> () 391 - | None -> 352 + (* Validate accesskey attribute - each key label must be a single code point *) 392 353 List.iter (fun (attr_name, attr_value) -> 393 354 let attr_lower = String.lowercase_ascii attr_name in 394 355 if attr_lower = "accesskey" then begin ··· 433 394 in 434 395 find_duplicates [] keys 435 396 end 436 - ) attrs); 397 + ) attrs; 437 398 438 - (* Validate that command and popovertarget cannot have aria-expanded *) 439 - (match namespace with 440 - | Some _ -> () 441 - | None when name_lower = "button" -> 442 - let has_command = Attr_utils.has_attr "command" attrs in 443 - let has_popovertarget = Attr_utils.has_attr "popovertarget" attrs in 444 - let has_aria_expanded = Attr_utils.has_attr "aria-expanded" attrs in 399 + (* Validate that command and popovertarget cannot have aria-expanded *) 400 + if name_lower = "button" then begin 401 + let has_command = Attr_utils.has_attr "command" attrs in 402 + let has_popovertarget = Attr_utils.has_attr "popovertarget" attrs in 403 + let has_aria_expanded = Attr_utils.has_attr "aria-expanded" attrs in 445 404 446 - if has_command && has_aria_expanded then 447 - Message_collector.add_typed collector 448 - (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 449 - `Condition "a \xe2\x80\x9ccommand\xe2\x80\x9d attribute"))); 405 + if has_command && has_aria_expanded then 406 + Message_collector.add_typed collector 407 + (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 408 + `Condition "a \xe2\x80\x9ccommand\xe2\x80\x9d attribute"))); 450 409 451 - if has_popovertarget && has_aria_expanded then 452 - Message_collector.add_typed collector 453 - (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 454 - `Condition "a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute"))) 455 - | None -> ()); 410 + if has_popovertarget && has_aria_expanded then 411 + Message_collector.add_typed collector 412 + (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 413 + `Condition "a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute"))) 414 + end; 456 415 457 - (* Note: data-* uppercase check requires XML parsing which preserves case. 458 - The HTML5 parser normalizes attribute names to lowercase, so this check 459 - is only effective when the document is parsed as XML. 460 - Commenting out until we have XML parsing support. *) 461 - ignore state.is_xhtml; 416 + (* Note: data-* uppercase check requires XML parsing which preserves case. 417 + The HTML5 parser normalizes attribute names to lowercase, so this check 418 + is only effective when the document is parsed as XML. 419 + Commenting out until we have XML parsing support. *) 420 + ignore state.is_xhtml; 462 421 463 - (* Validate media attribute on link, style, source elements *) 464 - let is_media_element = name_lower = "link" || name_lower = "style" || name_lower = "source" in 465 - (match namespace with 466 - | Some _ -> () 467 - | None when is_media_element -> 468 - List.iter (fun (attr_name, attr_value) -> 469 - let attr_lower = String.lowercase_ascii attr_name in 470 - if attr_lower = "media" then begin 471 - let trimmed = String.trim attr_value in 472 - if trimmed <> "" then begin 473 - match Dt_media_query.validate_media_query_strict trimmed with 474 - | Ok () -> () 475 - | Error msg -> 476 - Message_collector.add_typed collector 477 - (`Attr (`Bad_value_generic (`Message (Printf.sprintf 478 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad media query: %s" 479 - attr_value attr_name name msg)))) 422 + (* Validate media attribute on link, style, source elements *) 423 + let is_media_element = name_lower = "link" || name_lower = "style" || name_lower = "source" in 424 + if is_media_element then begin 425 + List.iter (fun (attr_name, attr_value) -> 426 + let attr_lower = String.lowercase_ascii attr_name in 427 + if attr_lower = "media" then begin 428 + let trimmed = String.trim attr_value in 429 + if trimmed <> "" then begin 430 + match Dt_media_query.validate_media_query_strict trimmed with 431 + | Ok () -> () 432 + | Error msg -> 433 + Message_collector.add_typed collector 434 + (`Attr (`Bad_value_generic (`Message (Printf.sprintf 435 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad media query: %s" 436 + attr_value attr_name name msg)))) 437 + end 480 438 end 481 - end 482 - ) attrs 483 - | None -> ()); 439 + ) attrs 440 + end; 484 441 485 - (* Validate RDFa prefix attribute - space-separated list of prefix:iri pairs *) 486 - (match namespace with 487 - | Some _ -> () 488 - | None -> 442 + (* Validate RDFa prefix attribute - space-separated list of prefix:iri pairs *) 489 443 List.iter (fun (attr_name, attr_value) -> 490 444 let attr_lower = String.lowercase_ascii attr_name in 491 445 if attr_lower = "prefix" then begin ··· 507 461 end 508 462 end 509 463 end 510 - ) attrs) 464 + ) attrs 511 465 512 - let end_element _state ~name:_ ~namespace:_ _collector = () 466 + | _ -> () (* Skip non-HTML elements *) 467 + 468 + let end_element _state ~tag:_ _collector = () 513 469 let characters _state _text _collector = () 514 470 let end_document _state _collector = () 515 471
+14 -15
lib/htmlrw_check/specialized/base_checker.ml
··· 11 11 let reset state = 12 12 state.seen_link_or_script <- false 13 13 14 - let start_element state ~name ~namespace ~attrs collector = 15 - match namespace with 16 - | Some _ -> () 17 - | None -> 18 - match String.lowercase_ascii name with 19 - | "link" | "script" -> 20 - state.seen_link_or_script <- true 21 - | "base" -> 22 - if state.seen_link_or_script then 23 - Message_collector.add_typed collector (`Misc `Base_after_link_script); 24 - (* base element must have href or target attribute *) 25 - if not (Attr_utils.has_attr "href" attrs || Attr_utils.has_attr "target" attrs) then 26 - Message_collector.add_typed collector (`Misc `Base_missing_href_or_target) 27 - | _ -> () 14 + let start_element state ~element collector = 15 + match element.Element.tag with 16 + | Tag.Html (`Link | `Script) -> 17 + state.seen_link_or_script <- true 18 + | Tag.Html `Base -> 19 + if state.seen_link_or_script then 20 + Message_collector.add_typed collector (`Misc `Base_after_link_script); 21 + (* base element must have href or target attribute *) 22 + let has_href = Attr.get_href element.attrs |> Option.is_some in 23 + let has_target = Attr.exists (function `Target _ -> true | _ -> false) element.attrs in 24 + if not (has_href || has_target) then 25 + Message_collector.add_typed collector (`Misc `Base_missing_href_or_target) 26 + | _ -> () 28 27 29 - let end_element _state ~name:_ ~namespace:_ _collector = () 28 + let end_element _state ~tag:_ _collector = () 30 29 let characters _state _text _collector = () 31 30 let end_document _state _collector = () 32 31
+8 -8
lib/htmlrw_check/specialized/datetime_checker.ml
··· 445 445 let create () = () 446 446 let reset _state = () 447 447 448 - let start_element _state ~name ~namespace ~attrs collector = 449 - if namespace <> None then () 450 - else begin 451 - let name_lower = String.lowercase_ascii name in 452 - if List.mem name_lower datetime_elements then begin 448 + let start_element _state ~element collector = 449 + match element.Element.tag with 450 + | Tag.Html tag -> 451 + let name = Tag.html_tag_to_string tag in 452 + if List.mem name datetime_elements then begin 453 453 (* Check for datetime attribute *) 454 454 let datetime_attr = List.find_map (fun (k, v) -> 455 455 if String.lowercase_ascii k = "datetime" then Some v else None 456 - ) attrs in 456 + ) element.raw_attrs in 457 457 match datetime_attr with 458 458 | None -> () 459 459 | Some value -> ··· 468 468 Message_collector.add_typed collector 469 469 (`Generic warn_msg) 470 470 end 471 - end 471 + | _ -> () (* Non-HTML elements don't have datetime attributes *) 472 472 473 - let end_element _state ~name:_ ~namespace:_ _collector = () 473 + let end_element _state ~tag:_ _collector = () 474 474 let characters _state _text _collector = () 475 475 let end_document _state _collector = () 476 476
+152 -176
lib/htmlrw_check/specialized/dl_checker.ml
··· 57 57 | ctx :: _ -> Some ctx 58 58 | [] -> None 59 59 60 - let start_element state ~name ~namespace ~attrs collector = 61 - let name_lower = String.lowercase_ascii name in 60 + let start_element state ~element collector = 61 + let name_lower = Tag.tag_to_string element.Element.tag in 62 62 63 63 (* Track parent stack for all HTML elements first *) 64 - if namespace = None then 65 - state.parent_stack <- name_lower :: state.parent_stack; 64 + (match element.tag with 65 + | Tag.Html _ -> state.parent_stack <- name_lower :: state.parent_stack 66 + | _ -> ()); 66 67 67 - if namespace <> None then () 68 - else begin 69 - match name_lower with 70 - | "template" -> 71 - state.in_template <- state.in_template + 1; 72 - (* Track if template is direct child of dl *) 73 - begin match current_dl state with 74 - | Some dl_ctx when state.div_in_dl_stack = [] -> 75 - dl_ctx.has_template <- true 76 - | _ -> () 77 - end 68 + match element.tag with 69 + | Tag.Html `Template -> 70 + state.in_template <- state.in_template + 1; 71 + (* Track if template is direct child of dl *) 72 + (match current_dl state with 73 + | Some dl_ctx when state.div_in_dl_stack = [] -> 74 + dl_ctx.has_template <- true 75 + | _ -> ()) 78 76 79 - | "dl" when state.in_template = 0 -> 80 - (* Check for nested dl - error if direct child of dl OR inside div-in-dl *) 81 - begin match current_div state with 82 - | Some _ -> 83 - Message_collector.add_typed collector 84 - (`Element (`Not_allowed_as_child (`Child "dl", `Parent "div"))) 85 - | None -> 86 - match current_dl state with 87 - | Some _ when state.in_dt_dd = 0 -> 88 - Message_collector.add_typed collector 89 - (`Element (`Not_allowed_as_child (`Child "dl", `Parent "dl"))) 90 - | _ -> () 91 - end; 92 - let ctx = { 93 - has_dt = false; 94 - has_dd = false; 95 - last_was_dt = false; 96 - contains_div = false; 97 - contains_dt_dd = false; 98 - dd_before_dt_error_reported = false; 99 - has_template = false; 100 - } in 101 - state.dl_stack <- ctx :: state.dl_stack 77 + | Tag.Html `Dl when state.in_template = 0 -> 78 + (* Check for nested dl - error if direct child of dl OR inside div-in-dl *) 79 + (match current_div state with 80 + | Some _ -> 81 + Message_collector.add_typed collector 82 + (`Element (`Not_allowed_as_child (`Child "dl", `Parent "div"))) 83 + | None -> 84 + match current_dl state with 85 + | Some _ when state.in_dt_dd = 0 -> 86 + Message_collector.add_typed collector 87 + (`Element (`Not_allowed_as_child (`Child "dl", `Parent "dl"))) 88 + | _ -> ()); 89 + let ctx = { 90 + has_dt = false; 91 + has_dd = false; 92 + last_was_dt = false; 93 + contains_div = false; 94 + contains_dt_dd = false; 95 + dd_before_dt_error_reported = false; 96 + has_template = false; 97 + } in 98 + state.dl_stack <- ctx :: state.dl_stack 102 99 103 - | "div" when state.in_template = 0 -> 104 - begin match current_dl state with 105 - | Some dl_ctx when state.div_in_dl_stack = [] -> 106 - (* Direct div child of dl *) 107 - dl_ctx.contains_div <- true; 108 - (* Check for mixed content - if we already have dt/dd, div is not allowed *) 109 - if dl_ctx.contains_dt_dd then 110 - Message_collector.add_typed collector 111 - (`Element (`Not_allowed_as_child (`Child "div", `Parent "dl"))); 112 - (* Check that role is only presentation or none *) 113 - (match Attr_utils.get_attr "role" attrs with 100 + | Tag.Html `Div when state.in_template = 0 -> 101 + (match current_dl state with 102 + | Some dl_ctx when state.div_in_dl_stack = [] -> 103 + dl_ctx.contains_div <- true; 104 + if dl_ctx.contains_dt_dd then 105 + Message_collector.add_typed collector 106 + (`Element (`Not_allowed_as_child (`Child "div", `Parent "dl"))); 107 + (match Attr.get_role element.attrs with 114 108 | Some role_value -> 115 109 let role_lower = String.lowercase_ascii (String.trim role_value) in 116 110 if role_lower <> "presentation" && role_lower <> "none" then 117 111 Message_collector.add_typed collector (`Li_role `Div_in_dl_bad_role) 118 112 | None -> ()); 119 - let div_ctx = { has_dt = false; has_dd = false; group_count = 0; in_dd_part = false } in 120 - state.div_in_dl_stack <- div_ctx :: state.div_in_dl_stack 121 - | Some _ when state.div_in_dl_stack <> [] -> 122 - Message_collector.add_typed collector 123 - (`Element (`Not_allowed_as_child (`Child "div", `Parent "div"))) 124 - | _ -> () 125 - end 126 - 127 - | "dt" when state.in_template = 0 -> 128 - state.in_dt_dd <- state.in_dt_dd + 1; 129 - begin match current_div state with 130 - | Some div_ctx -> 131 - (* If we've already seen dd, this dt starts a new group - which is not allowed *) 132 - if div_ctx.in_dd_part then begin 133 - Message_collector.add_typed collector 134 - (`Element (`Not_allowed_as_child (`Child "dt", `Parent "div"))); 135 - div_ctx.group_count <- div_ctx.group_count + 1; 136 - div_ctx.in_dd_part <- false 137 - end; 138 - div_ctx.has_dt <- true 139 - | None -> 140 - match current_dl state with 141 - | Some dl_ctx -> 142 - dl_ctx.has_dt <- true; 143 - dl_ctx.last_was_dt <- true; 144 - dl_ctx.contains_dt_dd <- true; 145 - (* Check for mixed content - if we already have div, dt is not allowed *) 146 - if dl_ctx.contains_div then 147 - Message_collector.add_typed collector 148 - (`Element (`Not_allowed_as_child (`Child "dt", `Parent "dl"))) 149 - | None -> 150 - (* dt outside dl context - error *) 151 - let parent = match current_parent state with 152 - | Some p -> p 153 - | None -> "document" 154 - in 155 - Message_collector.add_typed collector 156 - (`Element (`Not_allowed_as_child (`Child "dt", `Parent parent))) 157 - end 113 + let div_ctx = { has_dt = false; has_dd = false; group_count = 0; in_dd_part = false } in 114 + state.div_in_dl_stack <- div_ctx :: state.div_in_dl_stack 115 + | Some _ when state.div_in_dl_stack <> [] -> 116 + Message_collector.add_typed collector 117 + (`Element (`Not_allowed_as_child (`Child "div", `Parent "div"))) 118 + | _ -> ()) 158 119 159 - | "dd" when state.in_template = 0 -> 160 - state.in_dt_dd <- state.in_dt_dd + 1; 161 - begin match current_div state with 162 - | Some div_ctx -> 163 - div_ctx.has_dd <- true; 164 - (* First dd after dt(s) completes the first group *) 165 - if not div_ctx.in_dd_part then begin 166 - div_ctx.in_dd_part <- true; 167 - div_ctx.group_count <- div_ctx.group_count + 1 168 - end 169 - | None -> 170 - match current_dl state with 171 - | Some dl_ctx -> 172 - (* Check if dd appears before any dt - only report once per dl *) 173 - if not dl_ctx.has_dt && not dl_ctx.dd_before_dt_error_reported then begin 174 - dl_ctx.dd_before_dt_error_reported <- true; 175 - Message_collector.add_typed collector 176 - (`Element (`Missing_child_generic (`Parent "dl"))) 177 - end; 178 - dl_ctx.has_dd <- true; 179 - dl_ctx.last_was_dt <- false; 180 - dl_ctx.contains_dt_dd <- true; 181 - (* Check for mixed content *) 182 - if dl_ctx.contains_div then 183 - Message_collector.add_typed collector 184 - (`Element (`Not_allowed_as_child (`Child "dd", `Parent "dl"))) 185 - | None -> 186 - (* dd outside dl context - error *) 187 - let parent = match current_parent state with 188 - | Some p -> p 189 - | None -> "document" 190 - in 191 - Message_collector.add_typed collector 192 - (`Element (`Not_allowed_as_child (`Child "dd", `Parent parent))) 193 - end 120 + | Tag.Html `Dt when state.in_template = 0 -> 121 + state.in_dt_dd <- state.in_dt_dd + 1; 122 + (match current_div state with 123 + | Some div_ctx -> 124 + if div_ctx.in_dd_part then begin 125 + Message_collector.add_typed collector 126 + (`Element (`Not_allowed_as_child (`Child "dt", `Parent "div"))); 127 + div_ctx.group_count <- div_ctx.group_count + 1; 128 + div_ctx.in_dd_part <- false 129 + end; 130 + div_ctx.has_dt <- true 131 + | None -> 132 + match current_dl state with 133 + | Some dl_ctx -> 134 + dl_ctx.has_dt <- true; 135 + dl_ctx.last_was_dt <- true; 136 + dl_ctx.contains_dt_dd <- true; 137 + if dl_ctx.contains_div then 138 + Message_collector.add_typed collector 139 + (`Element (`Not_allowed_as_child (`Child "dt", `Parent "dl"))) 140 + | None -> 141 + let parent = match current_parent state with 142 + | Some p -> p 143 + | None -> "document" 144 + in 145 + Message_collector.add_typed collector 146 + (`Element (`Not_allowed_as_child (`Child "dt", `Parent parent)))) 194 147 195 - | _ -> () 196 - end 148 + | Tag.Html `Dd when state.in_template = 0 -> 149 + state.in_dt_dd <- state.in_dt_dd + 1; 150 + (match current_div state with 151 + | Some div_ctx -> 152 + div_ctx.has_dd <- true; 153 + if not div_ctx.in_dd_part then begin 154 + div_ctx.in_dd_part <- true; 155 + div_ctx.group_count <- div_ctx.group_count + 1 156 + end 157 + | None -> 158 + match current_dl state with 159 + | Some dl_ctx -> 160 + if not dl_ctx.has_dt && not dl_ctx.dd_before_dt_error_reported then begin 161 + dl_ctx.dd_before_dt_error_reported <- true; 162 + Message_collector.add_typed collector 163 + (`Element (`Missing_child_generic (`Parent "dl"))) 164 + end; 165 + dl_ctx.has_dd <- true; 166 + dl_ctx.last_was_dt <- false; 167 + dl_ctx.contains_dt_dd <- true; 168 + if dl_ctx.contains_div then 169 + Message_collector.add_typed collector 170 + (`Element (`Not_allowed_as_child (`Child "dd", `Parent "dl"))) 171 + | None -> 172 + let parent = match current_parent state with 173 + | Some p -> p 174 + | None -> "document" 175 + in 176 + Message_collector.add_typed collector 177 + (`Element (`Not_allowed_as_child (`Child "dd", `Parent parent)))) 197 178 198 - let end_element state ~name ~namespace collector = 199 - if namespace <> None then () 200 - else begin 201 - let name_lower = String.lowercase_ascii name in 179 + | _ -> () 202 180 181 + let end_element state ~tag collector = 182 + match tag with 183 + | Tag.Html _ -> 203 184 (* Pop from parent stack *) 204 185 (match state.parent_stack with 205 - | _ :: rest -> state.parent_stack <- rest 206 - | [] -> ()); 186 + | _ :: rest -> state.parent_stack <- rest 187 + | [] -> ()); 207 188 208 - match name_lower with 209 - | "template" -> 210 - state.in_template <- max 0 (state.in_template - 1) 189 + (match tag with 190 + | Tag.Html `Template -> 191 + state.in_template <- max 0 (state.in_template - 1) 211 192 212 - | "dt" | "dd" when state.in_template = 0 -> 213 - state.in_dt_dd <- max 0 (state.in_dt_dd - 1) 193 + | Tag.Html (`Dt | `Dd) when state.in_template = 0 -> 194 + state.in_dt_dd <- max 0 (state.in_dt_dd - 1) 214 195 215 - | "dl" when state.in_template = 0 -> 216 - begin match state.dl_stack with 217 - | ctx :: rest -> 218 - state.dl_stack <- rest; 219 - (* Check dl content model at end *) 220 - if ctx.contains_dt_dd then begin 221 - (* Direct dt/dd content - must have both *) 222 - if not ctx.has_dt && not ctx.dd_before_dt_error_reported then 223 - Message_collector.add_typed collector 224 - (`Element (`Missing_child_generic (`Parent "dl"))) 225 - else if not ctx.has_dd then begin 226 - if ctx.has_template then 196 + | Tag.Html `Dl when state.in_template = 0 -> 197 + (match state.dl_stack with 198 + | ctx :: rest -> 199 + state.dl_stack <- rest; 200 + if ctx.contains_dt_dd then begin 201 + if not ctx.has_dt && not ctx.dd_before_dt_error_reported then 227 202 Message_collector.add_typed collector 228 - (`Element (`Missing_child_one_of (`Parent "dl", `Children ["dd"]))) 229 - else 203 + (`Element (`Missing_child_generic (`Parent "dl"))) 204 + else if not ctx.has_dd then begin 205 + if ctx.has_template then 206 + Message_collector.add_typed collector 207 + (`Element (`Missing_child_one_of (`Parent "dl", `Children ["dd"]))) 208 + else 209 + Message_collector.add_typed collector 210 + (`Element (`Missing_child (`Parent "dl", `Child "dd"))) 211 + end 212 + else if ctx.last_was_dt then 230 213 Message_collector.add_typed collector 231 214 (`Element (`Missing_child (`Parent "dl", `Child "dd"))) 232 - end 233 - else if ctx.last_was_dt then 234 - Message_collector.add_typed collector 235 - (`Element (`Missing_child (`Parent "dl", `Child "dd"))) 236 - end else if not ctx.contains_div && not ctx.has_dt && not ctx.has_dd then 237 - () 238 - | [] -> () 239 - end 215 + end else if not ctx.contains_div && not ctx.has_dt && not ctx.has_dd then 216 + () 217 + | [] -> ()) 240 218 241 - | "div" when state.in_template = 0 -> 242 - begin match state.div_in_dl_stack with 243 - | div_ctx :: rest -> 244 - state.div_in_dl_stack <- rest; 245 - (* Check div in dl must have both dt and dd *) 246 - if not div_ctx.has_dt && not div_ctx.has_dd then 247 - Message_collector.add_typed collector 248 - (`Element (`Missing_child (`Parent "div", `Child "dd"))) 249 - else if not div_ctx.has_dt then 250 - Message_collector.add_typed collector 251 - (`Element (`Missing_child (`Parent "div", `Child "dt"))) 252 - else if not div_ctx.has_dd then 253 - Message_collector.add_typed collector 254 - (`Element (`Missing_child (`Parent "div", `Child "dd"))) 255 - | [] -> () 256 - end 219 + | Tag.Html `Div when state.in_template = 0 -> 220 + (match state.div_in_dl_stack with 221 + | div_ctx :: rest -> 222 + state.div_in_dl_stack <- rest; 223 + if not div_ctx.has_dt && not div_ctx.has_dd then 224 + Message_collector.add_typed collector 225 + (`Element (`Missing_child (`Parent "div", `Child "dd"))) 226 + else if not div_ctx.has_dt then 227 + Message_collector.add_typed collector 228 + (`Element (`Missing_child (`Parent "div", `Child "dt"))) 229 + else if not div_ctx.has_dd then 230 + Message_collector.add_typed collector 231 + (`Element (`Missing_child (`Parent "div", `Child "dd"))) 232 + | [] -> ()) 257 233 258 - | _ -> () 259 - end 234 + | _ -> ()) 235 + | _ -> () 260 236 261 237 let characters state text collector = 262 238 if state.in_template > 0 then ()
+12 -11
lib/htmlrw_check/specialized/h1_checker.ml
··· 14 14 state.h1_count <- 0; 15 15 state.svg_depth <- 0 16 16 17 - let start_element state ~name ~namespace ~attrs collector = 18 - ignore attrs; 19 - let name_lower = String.lowercase_ascii name in 17 + let start_element state ~element collector = 20 18 (* Track SVG depth - h1 inside SVG (foreignObject, desc) shouldn't count *) 21 - if name_lower = "svg" then 19 + match element.Element.tag with 20 + | Tag.Svg _ -> 22 21 state.svg_depth <- state.svg_depth + 1 23 - else if namespace <> None || state.svg_depth > 0 then 24 - () (* Skip non-HTML namespace or inside SVG *) 25 - else if name_lower = "h1" then begin 22 + | Tag.Html `H1 when state.svg_depth = 0 -> 26 23 state.h1_count <- state.h1_count + 1; 27 24 if state.h1_count > 1 then 28 25 Message_collector.add_typed collector (`Misc `Multiple_h1) 29 - end 26 + | Tag.Html _ when state.svg_depth = 0 -> 27 + () (* Other HTML elements outside SVG *) 28 + | _ -> 29 + () (* Non-HTML or inside SVG *) 30 30 31 - let end_element state ~name ~namespace:_ _collector = 32 - let name_lower = String.lowercase_ascii name in 33 - if name_lower = "svg" && state.svg_depth > 0 then 31 + let end_element state ~tag _collector = 32 + match tag with 33 + | Tag.Svg _ when state.svg_depth > 0 -> 34 34 state.svg_depth <- state.svg_depth - 1 35 + | _ -> () 35 36 36 37 let characters _state _text _collector = () 37 38 let end_document _state _collector = ()
+48 -59
lib/htmlrw_check/specialized/heading_checker.ml
··· 12 12 mutable h1_count : int; 13 13 mutable has_any_heading : bool; 14 14 mutable first_heading_checked : bool; 15 - mutable in_heading : string option; 15 + mutable in_heading : Tag.html_tag option; 16 16 mutable heading_has_text : bool; 17 17 } 18 18 ··· 34 34 state.in_heading <- None; 35 35 state.heading_has_text <- false 36 36 37 - (** Extract heading level from tag name (e.g., "h1" -> 1). *) 38 - let heading_level name = 39 - match String.lowercase_ascii name with 40 - | "h1" -> Some 1 41 - | "h2" -> Some 2 42 - | "h3" -> Some 3 43 - | "h4" -> Some 4 44 - | "h5" -> Some 5 45 - | "h6" -> Some 6 46 - | _ -> None 47 - 48 37 (** Check if text is effectively empty (only whitespace). *) 49 38 let is_empty_text text = 50 39 let rec check i = ··· 57 46 in 58 47 check 0 59 48 60 - let start_element state ~name ~namespace:_ ~attrs:_ collector = 61 - match heading_level name with 62 - | Some level -> 63 - state.has_any_heading <- true; 49 + let start_element state ~element collector = 50 + match element.Element.tag with 51 + | Tag.Html (#Tag.heading_tag as h) -> 52 + let level = match Tag.heading_level h with Some l -> l | None -> 0 in 53 + let name = Tag.html_tag_to_string h in 54 + state.has_any_heading <- true; 64 55 65 - (* Check if this is the first heading *) 66 - if not state.first_heading_checked then begin 67 - state.first_heading_checked <- true; 68 - if level <> 1 then 56 + (* Check if this is the first heading *) 57 + if not state.first_heading_checked then begin 58 + state.first_heading_checked <- true; 59 + if level <> 1 then 60 + Message_collector.add_typed collector 61 + (`Generic (Printf.sprintf 62 + "First heading in document is <%s>, should typically be <h1>" name)) 63 + end; 64 + 65 + (* Track h1 count *) 66 + if level = 1 then begin 67 + state.h1_count <- state.h1_count + 1; 68 + if state.h1_count > 1 then 69 + Message_collector.add_typed collector (`Misc `Multiple_h1) 70 + end; 71 + 72 + (* Check for skipped levels *) 73 + begin match state.current_level with 74 + | None -> 75 + state.current_level <- Some level 76 + | Some prev_level -> 77 + let diff = level - prev_level in 78 + if diff > 1 then 69 79 Message_collector.add_typed collector 70 80 (`Generic (Printf.sprintf 71 - "First heading in document is <%s>, should typically be <h1>" name)) 72 - end; 81 + "Heading level skipped: <%s> follows <h%d>, skipping %d level%s. This can confuse screen reader users" 82 + name prev_level (diff - 1) (if diff > 2 then "s" else ""))); 83 + state.current_level <- Some level 84 + end; 73 85 74 - (* Track h1 count *) 75 - if level = 1 then begin 76 - state.h1_count <- state.h1_count + 1; 77 - if state.h1_count > 1 then 78 - Message_collector.add_typed collector (`Misc `Multiple_h1) 79 - end; 86 + (* Track that we're in a heading to check for empty content *) 87 + state.in_heading <- Some h; 88 + state.heading_has_text <- false 89 + | _ -> () 80 90 81 - (* Check for skipped levels *) 82 - begin match state.current_level with 83 - | None -> 84 - state.current_level <- Some level 85 - | Some prev_level -> 86 - let diff = level - prev_level in 87 - if diff > 1 then 88 - Message_collector.add_typed collector 89 - (`Generic (Printf.sprintf 90 - "Heading level skipped: <%s> follows <h%d>, skipping %d level%s. This can confuse screen reader users" 91 - name prev_level (diff - 1) (if diff > 2 then "s" else ""))); 92 - state.current_level <- Some level 93 - end; 94 - 95 - (* Track that we're in a heading to check for empty content *) 96 - state.in_heading <- Some name; 97 - state.heading_has_text <- false 98 - 99 - | None -> 100 - (* Not a heading element *) 101 - () 102 - 103 - let end_element state ~name ~namespace:_ collector = 104 - match state.in_heading with 105 - | Some heading when heading = name -> 106 - if not state.heading_has_text then 107 - Message_collector.add_typed collector 108 - (`Generic (Printf.sprintf 109 - "Heading <%s> is empty or contains only whitespace. Empty headings are problematic for screen readers" name)); 110 - state.in_heading <- None; 111 - state.heading_has_text <- false 91 + let end_element state ~tag collector = 92 + match state.in_heading, tag with 93 + | Some h, Tag.Html h2 when h = h2 -> 94 + if not state.heading_has_text then 95 + Message_collector.add_typed collector 96 + (`Generic (Printf.sprintf 97 + "Heading <%s> is empty or contains only whitespace. Empty headings are problematic for screen readers" 98 + (Tag.html_tag_to_string h))); 99 + state.in_heading <- None; 100 + state.heading_has_text <- false 112 101 | _ -> () 113 102 114 103 let characters state text _collector =
+23 -29
lib/htmlrw_check/specialized/importmap_checker.ml
··· 265 265 266 266 List.rev !errors 267 267 268 - let start_element state ~name ~namespace ~attrs _collector = 269 - if namespace <> None then () 270 - else begin 271 - let name_lower = String.lowercase_ascii name in 272 - if name_lower = "script" then begin 273 - (* Check if type="importmap" *) 274 - let type_attr = List.find_opt (fun (n, _) -> 275 - String.lowercase_ascii n = "type" 276 - ) attrs in 277 - match type_attr with 278 - | Some (_, v) when String.lowercase_ascii v = "importmap" -> 279 - state.in_importmap <- true; 280 - Buffer.clear state.content 281 - | _ -> () 282 - end 283 - end 268 + let start_element state ~element _collector = 269 + match element.Element.tag with 270 + | Tag.Html `Script -> 271 + (* Check if type="importmap" *) 272 + let type_attr = List.find_opt (fun (n, _) -> 273 + String.lowercase_ascii n = "type" 274 + ) element.raw_attrs in 275 + (match type_attr with 276 + | Some (_, v) when String.lowercase_ascii v = "importmap" -> 277 + state.in_importmap <- true; 278 + Buffer.clear state.content 279 + | _ -> ()) 280 + | _ -> () (* Only script elements can be importmaps *) 284 281 285 282 let error_to_typed = function 286 283 | InvalidJSON _ -> `Importmap `Invalid_json ··· 295 292 | InvalidScopeValue _ -> `Importmap `Scopes_value_invalid_url 296 293 | ScopeValueNotObject -> `Importmap `Scopes_values_not_object 297 294 298 - let end_element state ~name ~namespace collector = 299 - if namespace <> None then () 300 - else begin 301 - let name_lower = String.lowercase_ascii name in 302 - if name_lower = "script" && state.in_importmap then begin 303 - let content = Buffer.contents state.content in 304 - let errors = validate_importmap content in 305 - List.iter (fun err -> 306 - Message_collector.add_typed collector (error_to_typed err) 307 - ) errors; 308 - state.in_importmap <- false 309 - end 310 - end 295 + let end_element state ~tag collector = 296 + match tag with 297 + | Tag.Html `Script when state.in_importmap -> 298 + let content = Buffer.contents state.content in 299 + let errors = validate_importmap content in 300 + List.iter (fun err -> 301 + Message_collector.add_typed collector (error_to_typed err) 302 + ) errors; 303 + state.in_importmap <- false 304 + | _ -> () 311 305 312 306 let characters state text _collector = 313 307 if state.in_importmap then
+37 -40
lib/htmlrw_check/specialized/label_checker.ml
··· 50 50 state.labels_for <- []; 51 51 state.labelable_ids <- [] 52 52 53 - let start_element state ~name ~namespace ~attrs collector = 54 - if namespace <> None then () 55 - else begin 56 - let name_lower = String.lowercase_ascii name in 53 + let start_element state ~element collector = 54 + match element.Element.tag with 55 + | Tag.Html `Label -> 56 + state.in_label <- true; 57 + state.label_depth <- 1; (* Start at 1 for the label element itself *) 58 + state.labelable_count <- 0; 59 + let for_value = get_attr element.raw_attrs "for" in 60 + let has_role = get_attr element.raw_attrs "role" <> None in 61 + let has_aria_label = get_attr element.raw_attrs "aria-label" <> None in 62 + state.label_for_value <- for_value; 63 + state.label_has_role <- has_role; 64 + state.label_has_aria_label <- has_aria_label; 65 + (* Track this label if it has for= and role/aria-label *) 66 + (match for_value with 67 + | Some target when has_role || has_aria_label -> 68 + state.labels_for <- { for_target = target; has_role; has_aria_label } :: state.labels_for 69 + | _ -> ()) 70 + 71 + | Tag.Html tag -> 72 + let name_lower = String.lowercase_ascii (Tag.tag_to_string (Tag.Html tag)) in 57 73 58 - if name_lower = "label" then begin 59 - state.in_label <- true; 60 - state.label_depth <- 1; (* Start at 1 for the label element itself *) 61 - state.labelable_count <- 0; 62 - let for_value = get_attr attrs "for" in 63 - let has_role = get_attr attrs "role" <> None in 64 - let has_aria_label = get_attr attrs "aria-label" <> None in 65 - state.label_for_value <- for_value; 66 - state.label_has_role <- has_role; 67 - state.label_has_aria_label <- has_aria_label; 68 - (* Track this label if it has for= and role/aria-label *) 69 - (match for_value with 70 - | Some target when has_role || has_aria_label -> 71 - state.labels_for <- { for_target = target; has_role; has_aria_label } :: state.labels_for 72 - | _ -> ()) 73 - end; 74 74 (* Track labelable element IDs *) 75 75 (if List.mem name_lower labelable_elements then 76 - match get_attr attrs "id" with 76 + match get_attr element.raw_attrs "id" with 77 77 | Some id -> state.labelable_ids <- id :: state.labelable_ids 78 78 | None -> ()); 79 79 80 - if state.in_label && name_lower <> "label" then begin 80 + if state.in_label then begin 81 81 state.label_depth <- state.label_depth + 1; 82 82 83 83 (* Check for labelable elements inside label *) ··· 89 89 (* Check if label has for attribute and descendant has mismatched id *) 90 90 (match state.label_for_value with 91 91 | Some for_value -> 92 - let descendant_id = get_attr attrs "id" in 92 + let descendant_id = get_attr element.raw_attrs "id" in 93 93 (match descendant_id with 94 94 | None -> 95 95 Message_collector.add_typed collector (`Label `For_id_mismatch) ··· 99 99 | None -> ()) 100 100 end 101 101 end 102 - end 103 102 104 - let end_element state ~name ~namespace collector = 105 - if namespace <> None then () 106 - else begin 107 - let name_lower = String.lowercase_ascii name in 103 + | _ -> () (* Non-HTML elements (SVG, MathML, etc.) *) 108 104 109 - if state.in_label then begin 110 - state.label_depth <- state.label_depth - 1; 105 + let end_element state ~tag collector = 106 + if state.in_label then begin 107 + state.label_depth <- state.label_depth - 1; 111 108 112 - if name_lower = "label" && state.label_depth = 0 then begin 113 - if state.label_has_role && state.labelable_count > 0 then 114 - Message_collector.add_typed collector (`Label `Role_on_ancestor); 115 - state.in_label <- false; 116 - state.labelable_count <- 0; 117 - state.label_for_value <- None; 118 - state.label_has_role <- false; 119 - state.label_has_aria_label <- false 120 - end 121 - end 109 + match tag with 110 + | Tag.Html `Label when state.label_depth = 0 -> 111 + if state.label_has_role && state.labelable_count > 0 then 112 + Message_collector.add_typed collector (`Label `Role_on_ancestor); 113 + state.in_label <- false; 114 + state.labelable_count <- 0; 115 + state.label_for_value <- None; 116 + state.label_has_role <- false; 117 + state.label_has_aria_label <- false 118 + | _ -> () 122 119 end 123 120 124 121 let characters _state _text _collector = ()
+4 -3
lib/htmlrw_check/specialized/language_checker.ml
··· 89 89 | _ -> () 90 90 end 91 91 92 - let start_element _state ~name ~namespace ~attrs collector = 92 + let start_element _state ~element collector = 93 93 let location = None in 94 - process_language_attrs ~element:name ~namespace ~attrs ~location collector 94 + let name = Tag.tag_to_string element.Element.tag in 95 + process_language_attrs ~element:name ~namespace:None ~attrs:element.raw_attrs ~location collector 95 96 96 - let end_element _state ~name:_ ~namespace:_ _collector = 97 + let end_element _state ~tag:_ _collector = 97 98 () 98 99 99 100 let characters _state _text _collector =
+5 -2
lib/htmlrw_check/specialized/microdata_checker.ml
··· 270 270 let all_nodes = Hashtbl.to_seq_keys graph |> List.of_seq in 271 271 check_all_nodes [] all_nodes 272 272 273 - let start_element state ~name ~namespace:_ ~attrs collector = 273 + let start_element state ~element collector = 274 + let name = Tag.tag_to_string element.Element.tag in 275 + let attrs = element.raw_attrs in 274 276 let location = None in 275 277 track_id state attrs; 276 278 process_microdata_attrs state ~element:name ~attrs ~location collector 277 279 278 - let end_element state ~name ~namespace:_ _collector = 280 + let end_element state ~tag _collector = 281 + let name = Tag.tag_to_string tag in 279 282 (* Pop itemscope from stack if this element had one *) 280 283 match state.scope_stack with 281 284 | scope :: rest when scope.element = name ->
+9 -8
lib/htmlrw_check/specialized/mime_type_checker.ml
··· 156 156 if String.lowercase_ascii k = String.lowercase_ascii name then Some v else None 157 157 ) attrs 158 158 159 - let start_element _state ~name ~namespace ~attrs collector = 160 - if namespace <> None then () 161 - else begin 159 + let start_element _state ~element collector = 160 + match element.Element.tag with 161 + | Tag.Html tag -> 162 + let name = Tag.html_tag_to_string tag in 162 163 let name_lower = String.lowercase_ascii name in 163 - match List.assoc_opt name_lower mime_type_attrs with 164 + (match List.assoc_opt name_lower mime_type_attrs with 164 165 | None -> () 165 166 | Some type_attrs -> 166 167 List.iter (fun attr_name -> 167 - match get_attr_value attr_name attrs with 168 + match get_attr_value attr_name element.raw_attrs with 168 169 | None -> () 169 170 | Some value -> 170 171 (* Don't validate empty type attributes or special script types *) ··· 186 187 | Some err -> 187 188 Message_collector.add_typed collector 188 189 (`Attr (`Bad_value_generic (`Message err))) 189 - ) type_attrs 190 - end 190 + ) type_attrs) 191 + | _ -> () (* Non-HTML elements don't have MIME type checks *) 191 192 192 - let end_element _state ~name:_ ~namespace:_ _collector = () 193 + let end_element _state ~tag:_ _collector = () 193 194 let characters _state _text _collector = () 194 195 let end_document _state _collector = () 195 196
+2 -2
lib/htmlrw_check/specialized/normalization_checker.ml
··· 40 40 if end_pos = len then s 41 41 else String.sub s 0 end_pos 42 42 43 - let start_element _state ~name:_ ~namespace:_ ~attrs:_ _collector = () 43 + let start_element _state ~element:_ _collector = () 44 44 45 - let end_element _state ~name:_ ~namespace:_ _collector = () 45 + let end_element _state ~tag:_ _collector = () 46 46 47 47 let characters _state text collector = 48 48 (* Skip empty text or whitespace-only text *)
+90 -91
lib/htmlrw_check/specialized/picture_checker.ml
··· 93 93 let check_img_attrs attrs collector = 94 94 check_disallowed_attrs "img" disallowed_img_attrs attrs collector 95 95 96 - let start_element state ~name ~namespace ~attrs collector = 97 - let name_lower = String.lowercase_ascii name in 96 + let start_element state ~element collector = 97 + let name_lower = Tag.tag_to_string element.Element.tag in 98 + let attrs = element.raw_attrs in 98 99 99 100 (* Check for disallowed children of picture first - even foreign content *) 100 101 if state.in_picture && state.picture_depth = 1 then begin ··· 103 104 end; 104 105 105 106 (* Rest of checks only apply to HTML namespace elements *) 106 - match namespace with 107 - | Some _ -> () 108 - | None -> 109 - (match name_lower with 110 - | "picture" -> 111 - (* Check if picture is in a disallowed parent context *) 112 - (match state.parent_stack with 113 - | parent :: _ when List.mem parent disallowed_picture_parents -> 114 - Message_collector.add_typed collector 115 - (`Element (`Not_allowed_as_child (`Child "picture", `Parent parent))) 116 - | _ -> ()); 117 - check_picture_attrs attrs collector; 118 - state.in_picture <- true; 119 - state.has_img_in_picture <- false; 120 - state.picture_depth <- 0; 121 - state.children_in_picture <- []; 122 - state.last_was_img <- false; 123 - state.has_source_after_img <- false; 124 - state.has_always_matching_source <- false; 125 - state.source_after_always_matching <- false 107 + (match element.tag with 108 + | Tag.Html `Picture -> 109 + (* Check if picture is in a disallowed parent context *) 110 + (match state.parent_stack with 111 + | parent :: _ when List.mem parent disallowed_picture_parents -> 112 + Message_collector.add_typed collector 113 + (`Element (`Not_allowed_as_child (`Child "picture", `Parent parent))) 114 + | _ -> ()); 115 + check_picture_attrs attrs collector; 116 + state.in_picture <- true; 117 + state.has_img_in_picture <- false; 118 + state.picture_depth <- 0; 119 + state.children_in_picture <- []; 120 + state.last_was_img <- false; 121 + state.has_source_after_img <- false; 122 + state.has_always_matching_source <- false; 123 + state.source_after_always_matching <- false 126 124 127 - | "source" when state.in_picture && state.picture_depth = 1 -> 128 - check_source_attrs_in_picture attrs collector; 129 - state.children_in_picture <- "source" :: state.children_in_picture; 130 - if state.last_was_img then 131 - state.has_source_after_img <- true; 132 - if state.has_always_matching_source then 133 - state.source_after_always_matching <- true; 134 - (* A source is "always matching" if it has no media/type, or media="" or media="all" *) 135 - let media_value = Attr_utils.get_attr "media" attrs in 136 - let has_type = Attr_utils.has_attr "type" attrs in 137 - let is_media_all = match media_value with 138 - | Some v -> String.lowercase_ascii (String.trim v) = "all" 139 - | None -> false in 140 - let is_media_empty = match media_value with 141 - | Some v -> String.trim v = "" 142 - | None -> false in 143 - let is_always_matching = match media_value with 144 - | None -> not has_type 145 - | Some v -> 146 - let trimmed = String.trim v in 147 - trimmed = "" || String.lowercase_ascii trimmed = "all" 148 - in 149 - if is_always_matching then begin 150 - state.has_always_matching_source <- true; 151 - (* Only set flags to true, never reset to false *) 152 - if is_media_all then state.always_matching_is_media_all <- true; 153 - if is_media_empty then state.always_matching_is_media_empty <- true 154 - end 125 + | Tag.Html `Source when state.in_picture && state.picture_depth = 1 -> 126 + check_source_attrs_in_picture attrs collector; 127 + state.children_in_picture <- "source" :: state.children_in_picture; 128 + if state.last_was_img then 129 + state.has_source_after_img <- true; 130 + if state.has_always_matching_source then 131 + state.source_after_always_matching <- true; 132 + (* A source is "always matching" if it has no media/type, or media="" or media="all" *) 133 + let media_value = Attr_utils.get_attr "media" attrs in 134 + let has_type = Attr_utils.has_attr "type" attrs in 135 + let is_media_all = match media_value with 136 + | Some v -> String.lowercase_ascii (String.trim v) = "all" 137 + | None -> false in 138 + let is_media_empty = match media_value with 139 + | Some v -> String.trim v = "" 140 + | None -> false in 141 + let is_always_matching = match media_value with 142 + | None -> not has_type 143 + | Some v -> 144 + let trimmed = String.trim v in 145 + trimmed = "" || String.lowercase_ascii trimmed = "all" 146 + in 147 + if is_always_matching then begin 148 + state.has_always_matching_source <- true; 149 + if is_media_all then state.always_matching_is_media_all <- true; 150 + if is_media_empty then state.always_matching_is_media_empty <- true 151 + end 155 152 156 - | "img" when state.in_picture && state.picture_depth = 1 -> 157 - check_img_attrs attrs collector; 158 - state.has_img_in_picture <- true; 159 - state.children_in_picture <- "img" :: state.children_in_picture; 160 - state.last_was_img <- true; 161 - let img_count = List.length (List.filter (( = ) "img") state.children_in_picture) in 162 - if img_count > 1 then 163 - report_disallowed_child "picture" "img" collector; 164 - if state.has_always_matching_source && Attr_utils.has_attr "srcset" attrs then 165 - Message_collector.add_typed collector 166 - (if state.always_matching_is_media_all then `Misc `Media_all 167 - else if state.always_matching_is_media_empty then `Misc `Media_empty 168 - else `Srcset `Source_needs_media_or_type) 153 + | Tag.Html `Img when state.in_picture && state.picture_depth = 1 -> 154 + check_img_attrs attrs collector; 155 + state.has_img_in_picture <- true; 156 + state.children_in_picture <- "img" :: state.children_in_picture; 157 + state.last_was_img <- true; 158 + let img_count = List.length (List.filter (( = ) "img") state.children_in_picture) in 159 + if img_count > 1 then 160 + report_disallowed_child "picture" "img" collector; 161 + if state.has_always_matching_source && Attr_utils.has_attr "srcset" attrs then 162 + Message_collector.add_typed collector 163 + (if state.always_matching_is_media_all then `Misc `Media_all 164 + else if state.always_matching_is_media_empty then `Misc `Media_empty 165 + else `Srcset `Source_needs_media_or_type) 169 166 170 - | "script" when state.in_picture && state.picture_depth = 1 -> 171 - state.children_in_picture <- "script" :: state.children_in_picture 167 + | Tag.Html `Script when state.in_picture && state.picture_depth = 1 -> 168 + state.children_in_picture <- "script" :: state.children_in_picture 172 169 173 - | "template" when state.in_picture && state.picture_depth = 1 -> 174 - state.children_in_picture <- "template" :: state.children_in_picture 170 + | Tag.Html `Template when state.in_picture && state.picture_depth = 1 -> 171 + state.children_in_picture <- "template" :: state.children_in_picture 175 172 176 - | "img" -> 177 - check_img_attrs attrs collector 173 + | Tag.Html `Img -> 174 + check_img_attrs attrs collector 178 175 179 - | _ -> ()); 176 + | _ -> ()); 180 177 181 178 (* Track depth when inside picture *) 182 179 if state.in_picture then 183 180 state.picture_depth <- state.picture_depth + 1; 184 181 185 182 (* Push to parent stack (only HTML namespace elements) *) 186 - if namespace = None then 187 - state.parent_stack <- name_lower :: state.parent_stack 183 + (match element.tag with 184 + | Tag.Html _ -> state.parent_stack <- name_lower :: state.parent_stack 185 + | _ -> ()) 188 186 189 - let end_element state ~name ~namespace collector = 190 - match namespace with 191 - | Some _ -> () 192 - | None -> 193 - let name_lower = String.lowercase_ascii name in 194 - 187 + let end_element state ~tag collector = 188 + match tag with 189 + | Tag.Html _ -> 190 + let name_lower = Tag.tag_to_string tag in 195 191 if state.in_picture then 196 192 state.picture_depth <- state.picture_depth - 1; 197 193 198 - if name_lower = "picture" && state.picture_depth = 0 then begin 199 - if not state.has_img_in_picture then 200 - Message_collector.add_typed collector (`Srcset `Picture_missing_img); 201 - if state.has_source_after_img then 202 - report_disallowed_child "picture" "source" collector; 203 - if state.source_after_always_matching then 204 - Message_collector.add_typed collector 205 - (if state.always_matching_is_media_all then `Misc `Media_all 206 - else if state.always_matching_is_media_empty then `Misc `Media_empty 207 - else `Srcset `Source_needs_media_or_type); 208 - state.in_picture <- false 209 - end; 194 + (match tag with 195 + | Tag.Html `Picture when state.picture_depth = 0 -> 196 + if not state.has_img_in_picture then 197 + Message_collector.add_typed collector (`Srcset `Picture_missing_img); 198 + if state.has_source_after_img then 199 + report_disallowed_child "picture" "source" collector; 200 + if state.source_after_always_matching then 201 + Message_collector.add_typed collector 202 + (if state.always_matching_is_media_all then `Misc `Media_all 203 + else if state.always_matching_is_media_empty then `Misc `Media_empty 204 + else `Srcset `Source_needs_media_or_type); 205 + state.in_picture <- false 206 + | _ -> ()); 210 207 211 - state.parent_stack <- match state.parent_stack with _ :: rest -> rest | [] -> [] 208 + ignore name_lower; 209 + state.parent_stack <- (match state.parent_stack with _ :: rest -> rest | [] -> []) 210 + | _ -> () 212 211 213 212 let characters state text collector = 214 213 (* Text in picture element is not allowed *)
+63 -71
lib/htmlrw_check/specialized/ruby_checker.ml
··· 26 26 state.in_template <- 0 27 27 28 28 (** Check if element is phrasing content that can appear before rt *) 29 - let is_phrasing_content name = 30 - let name_lower = String.lowercase_ascii name in 31 - (* rt and rp are special - they don't count as "content before rt" *) 32 - name_lower <> "rt" && name_lower <> "rp" 29 + let is_phrasing_content tag = 30 + match tag with 31 + | Tag.Html `Rt | Tag.Html `Rp -> false 32 + | _ -> true 33 33 34 - let start_element state ~name ~namespace ~attrs _collector = 35 - ignore attrs; 36 - if namespace <> None then () 37 - else begin 38 - let name_lower = String.lowercase_ascii name in 34 + let start_element state ~element _collector = 35 + match element.Element.tag with 36 + | Tag.Html `Template -> 37 + state.in_template <- state.in_template + 1 39 38 40 - if name_lower = "template" then 41 - state.in_template <- state.in_template + 1; 39 + | Tag.Html `Ruby when state.in_template = 0 -> 40 + (* Push new ruby context *) 41 + let info = { 42 + has_rt = false; 43 + has_content_before_rt = false; 44 + saw_rt = false; 45 + depth = 1; (* Set depth to 1 for the ruby element itself *) 46 + } in 47 + state.ruby_stack <- info :: state.ruby_stack 42 48 43 - if state.in_template > 0 then () 44 - else begin 45 - if name_lower = "ruby" then begin 46 - (* Push new ruby context *) 47 - let info = { 48 - has_rt = false; 49 - has_content_before_rt = false; 50 - saw_rt = false; 51 - depth = 0; 52 - } in 53 - state.ruby_stack <- info :: state.ruby_stack 49 + | tag when state.in_template = 0 -> 50 + (match state.ruby_stack with 51 + | info :: _ -> 52 + (* Inside a ruby element *) 53 + if info.depth = 1 then begin 54 + (* Direct children of ruby *) 55 + match tag with 56 + | Tag.Html `Rt -> 57 + info.has_rt <- true; 58 + info.saw_rt <- true 59 + | _ when is_phrasing_content tag -> 60 + if not info.saw_rt then 61 + info.has_content_before_rt <- true 62 + | _ -> () 54 63 end; 64 + info.depth <- info.depth + 1 65 + | [] -> ()) 55 66 56 - match state.ruby_stack with 57 - | info :: _ -> 58 - (* Inside a ruby element *) 59 - if name_lower = "ruby" then begin 60 - (* This is the opening of ruby, set depth to 1 *) 61 - info.depth <- 1 62 - end else begin 63 - if info.depth = 1 then begin 64 - (* Direct children of ruby *) 65 - if name_lower = "rt" then begin 66 - info.has_rt <- true; 67 - info.saw_rt <- true 68 - end else if is_phrasing_content name_lower then begin 69 - if not info.saw_rt then 70 - info.has_content_before_rt <- true 71 - end 72 - end; 73 - info.depth <- info.depth + 1 74 - end 75 - | [] -> () 76 - end 77 - end 67 + | _ -> () (* In template or non-HTML element *) 68 + 69 + let end_element state ~tag collector = 70 + match tag with 71 + | Tag.Html `Template when state.in_template > 0 -> 72 + state.in_template <- state.in_template - 1 78 73 79 - let end_element state ~name ~namespace collector = 80 - if namespace <> None then () 81 - else begin 82 - let name_lower = String.lowercase_ascii name in 74 + | Tag.Html `Ruby when state.in_template = 0 -> 75 + (match state.ruby_stack with 76 + | info :: rest -> 77 + info.depth <- info.depth - 1; 78 + (* Check if this is the closing ruby tag (depth becomes 0 when ruby closes) *) 79 + if info.depth <= 0 then begin 80 + (* Closing ruby element - validate *) 81 + if not info.has_rt then 82 + (* Empty ruby or ruby without any rt - needs rp or rt *) 83 + Message_collector.add_typed collector 84 + (`Element (`Missing_child_one_of (`Parent "ruby", `Children ["rp"; "rt"]))) 85 + else if not info.has_content_before_rt then 86 + (* Has rt but missing content before it - needs content *) 87 + Message_collector.add_typed collector 88 + (`Element (`Missing_child (`Parent "ruby", `Child "rt"))); 89 + state.ruby_stack <- rest 90 + end 91 + | [] -> ()) 83 92 84 - if name_lower = "template" && state.in_template > 0 then 85 - state.in_template <- state.in_template - 1; 93 + | _ when state.in_template = 0 -> 94 + (match state.ruby_stack with 95 + | info :: _ -> 96 + info.depth <- info.depth - 1 97 + | [] -> ()) 86 98 87 - if state.in_template > 0 then () 88 - else begin 89 - match state.ruby_stack with 90 - | info :: rest -> 91 - info.depth <- info.depth - 1; 92 - (* Check if this is the closing ruby tag (depth becomes 0 when ruby closes) *) 93 - if name_lower = "ruby" && info.depth <= 0 then begin 94 - (* Closing ruby element - validate *) 95 - if not info.has_rt then 96 - (* Empty ruby or ruby without any rt - needs rp or rt *) 97 - Message_collector.add_typed collector 98 - (`Element (`Missing_child_one_of (`Parent "ruby", `Children ["rp"; "rt"]))) 99 - else if not info.has_content_before_rt then 100 - (* Has rt but missing content before it - needs content *) 101 - Message_collector.add_typed collector 102 - (`Element (`Missing_child (`Parent "ruby", `Child "rt"))); 103 - state.ruby_stack <- rest 104 - end 105 - | [] -> () 106 - end 107 - end 99 + | _ -> () (* In template or non-HTML element *) 108 100 109 101 let characters state text _collector = 110 102 (* Text content counts as phrasing content before rt *)
+33 -44
lib/htmlrw_check/specialized/source_checker.ml
··· 23 23 | ctx :: _ -> ctx 24 24 | [] -> Other 25 25 26 - let start_element state ~name ~namespace ~attrs collector = 27 - if namespace <> None then () 28 - else begin 29 - let name_lower = String.lowercase_ascii name in 30 - match name_lower with 31 - | "picture" -> 32 - state.context_stack <- Picture :: state.context_stack 33 - | "video" -> 34 - state.context_stack <- Video :: state.context_stack 35 - | "audio" -> 36 - state.context_stack <- Audio :: state.context_stack 37 - | "source" -> 38 - let ctx = current_context state in 39 - begin match ctx with 40 - | Video | Audio -> 41 - if Attr_utils.has_attr "srcset" attrs then 42 - Message_collector.add_typed collector 43 - (`Attr (`Not_allowed (`Attr "srcset", `Elem "source"))); 44 - if Attr_utils.has_attr "sizes" attrs then 45 - Message_collector.add_typed collector 46 - (`Attr (`Not_allowed (`Attr "sizes", `Elem "source"))); 47 - if Attr_utils.has_attr "width" attrs then 48 - Message_collector.add_typed collector 49 - (`Attr (`Not_allowed (`Attr "width", `Elem "source"))); 50 - if Attr_utils.has_attr "height" attrs then 51 - Message_collector.add_typed collector 52 - (`Attr (`Not_allowed (`Attr "height", `Elem "source"))) 53 - | Picture | Other -> () 54 - end 55 - | _ -> 56 - (* Any other element maintains current context *) 57 - () 58 - end 26 + let start_element state ~element collector = 27 + match element.Element.tag with 28 + | Tag.Html `Picture -> 29 + state.context_stack <- Picture :: state.context_stack 30 + | Tag.Html `Video -> 31 + state.context_stack <- Video :: state.context_stack 32 + | Tag.Html `Audio -> 33 + state.context_stack <- Audio :: state.context_stack 34 + | Tag.Html `Source -> 35 + let ctx = current_context state in 36 + (match ctx with 37 + | Video | Audio -> 38 + if Attr_utils.has_attr "srcset" element.raw_attrs then 39 + Message_collector.add_typed collector 40 + (`Attr (`Not_allowed (`Attr "srcset", `Elem "source"))); 41 + if Attr_utils.has_attr "sizes" element.raw_attrs then 42 + Message_collector.add_typed collector 43 + (`Attr (`Not_allowed (`Attr "sizes", `Elem "source"))); 44 + if Attr_utils.has_attr "width" element.raw_attrs then 45 + Message_collector.add_typed collector 46 + (`Attr (`Not_allowed (`Attr "width", `Elem "source"))); 47 + if Attr_utils.has_attr "height" element.raw_attrs then 48 + Message_collector.add_typed collector 49 + (`Attr (`Not_allowed (`Attr "height", `Elem "source"))) 50 + | Picture | Other -> ()) 51 + | _ -> () 59 52 60 - let end_element state ~name ~namespace _collector = 61 - if namespace <> None then () 62 - else begin 63 - let name_lower = String.lowercase_ascii name in 64 - match name_lower with 65 - | "picture" | "video" | "audio" -> 66 - (match state.context_stack with 67 - | _ :: rest -> state.context_stack <- rest 68 - | [] -> ()) 69 - | _ -> () 70 - end 53 + let end_element state ~tag _collector = 54 + match tag with 55 + | Tag.Html (`Picture | `Video | `Audio) -> 56 + (match state.context_stack with 57 + | _ :: rest -> state.context_stack <- rest 58 + | [] -> ()) 59 + | _ -> () 71 60 72 61 let characters _state _text _collector = () 73 62
+26 -31
lib/htmlrw_check/specialized/srcset_sizes_checker.ml
··· 960 960 Message_collector.add_typed collector 961 961 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Mixing width and density descriptors is not allowed." value element_name)))) 962 962 963 - let start_element _state ~name ~namespace ~attrs collector = 964 - let name_lower = String.lowercase_ascii name in 965 - 966 - (* SVG image elements should not have srcset *) 967 - if namespace <> None && name_lower = "image" then begin 968 - if Attr_utils.get_attr "srcset" attrs <> None then 963 + let start_element _state ~element collector = 964 + match element.Element.tag with 965 + | Tag.Svg "image" -> 966 + (* SVG image elements should not have srcset *) 967 + if Attr_utils.get_attr "srcset" element.Element.raw_attrs <> None then 969 968 Message_collector.add_typed collector 970 969 (`Attr (`Not_allowed (`Attr "srcset", `Elem "image"))) 971 - end; 970 + | Tag.Html (`Img | `Source as tag) -> 971 + let name_lower = Tag.html_tag_to_string tag in 972 + let attrs = element.raw_attrs in 973 + let sizes_value = Attr_utils.get_attr "sizes" attrs in 974 + let srcset_value = Attr_utils.get_attr "srcset" attrs in 975 + let has_sizes = sizes_value <> None in 976 + let has_srcset = srcset_value <> None in 972 977 973 - if namespace <> None then () 974 - else begin 975 - (* Check sizes and srcset on img and source *) 976 - if name_lower = "img" || name_lower = "source" then begin 977 - let sizes_value = Attr_utils.get_attr "sizes" attrs in 978 - let srcset_value = Attr_utils.get_attr "srcset" attrs in 979 - let has_sizes = sizes_value <> None in 980 - let has_srcset = srcset_value <> None in 981 - 982 - (* Validate sizes if present *) 983 - (match sizes_value with 984 - | Some v -> ignore (validate_sizes v name_lower collector) 985 - | None -> ()); 978 + (* Validate sizes if present *) 979 + (match sizes_value with 980 + | Some v -> ignore (validate_sizes v name_lower collector) 981 + | None -> ()); 986 982 987 - (* Validate srcset if present *) 988 - (match srcset_value with 989 - | Some v -> validate_srcset v name_lower has_sizes collector 990 - | None -> ()); 983 + (* Validate srcset if present *) 984 + (match srcset_value with 985 + | Some v -> validate_srcset v name_lower has_sizes collector 986 + | None -> ()); 991 987 992 - (* Error: sizes without srcset on img *) 993 - if name_lower = "img" && has_sizes && not has_srcset then 994 - Message_collector.add_typed collector 995 - (`Srcset `Sizes_without_srcset) 996 - end 997 - end 988 + (* Error: sizes without srcset on img *) 989 + if name_lower = "img" && has_sizes && not has_srcset then 990 + Message_collector.add_typed collector 991 + (`Srcset `Sizes_without_srcset) 992 + | _ -> () (* Other elements *) 998 993 999 - let end_element _state ~name:_ ~namespace:_ _collector = () 994 + let end_element _state ~tag:_ _collector = () 1000 995 let characters _state _text _collector = () 1001 996 let end_document _state _collector = () 1002 997
+8 -5
lib/htmlrw_check/specialized/svg_checker.ml
··· 30 30 state.fecomponenttransfer_stack <- [] 31 31 32 32 (* SVG namespace - the DOM stores this as "svg" shorthand *) 33 - let svg_ns = "svg" 33 + let _svg_ns = "svg" 34 34 35 35 (* Full SVG namespace URL for validation *) 36 36 let svg_ns_url = "http://www.w3.org/2000/svg" ··· 348 348 end 349 349 with Not_found -> () 350 350 351 - let start_element state ~name ~namespace ~attrs collector = 352 - let is_svg_element = namespace = Some svg_ns in 351 + let start_element state ~element collector = 352 + let is_svg_element = match element.Element.tag with Tag.Svg _ -> true | _ -> false in 353 + let name = Tag.tag_to_string element.tag in 354 + let attrs = element.raw_attrs in 353 355 354 356 (* Track if we're in SVG context *) 355 357 if name = "svg" && is_svg_element then ··· 448 450 | None -> ()) 449 451 end 450 452 451 - let end_element state ~name ~namespace collector = 452 - let is_svg_element = namespace = Some svg_ns in 453 + let end_element state ~tag collector = 454 + let is_svg_element = match tag with Tag.Svg _ -> true | _ -> false in 455 + let name = Tag.tag_to_string tag in 453 456 454 457 if is_svg_element || state.in_svg then begin 455 458 let name_lower = String.lowercase_ascii name in
+42 -41
lib/htmlrw_check/specialized/table_checker.ml
··· 688 688 689 689 let reset state = state.tables := [] 690 690 691 - let is_html_namespace = function 691 + let _is_html_namespace = function 692 692 | None -> true (* HTML mode - no namespace specified *) 693 693 | Some ns -> ns = html_ns (* XHTML mode - check namespace *) 694 694 695 - let start_element state ~name ~namespace ~attrs collector = 696 - if is_html_namespace namespace then ( 697 - let name_lower = String.lowercase_ascii name in 698 - match name_lower with 699 - | "table" -> 700 - (* Push a new table onto the stack *) 701 - state.tables := make_table () :: !(state.tables) 702 - | _ -> ( 703 - match !(state.tables) with 704 - | [] -> () 705 - | table :: _ -> ( 706 - match name_lower with 707 - | "td" -> start_cell table false attrs collector 708 - | "th" -> start_cell table true attrs collector 709 - | "tr" -> start_row table collector 710 - | "tbody" | "thead" | "tfoot" -> start_row_group table name collector 711 - | "col" -> start_col table attrs collector 712 - | "colgroup" -> start_colgroup table attrs collector 713 - | _ -> ()))) 695 + let start_element state ~element collector = 696 + let attrs = element.Element.raw_attrs in 697 + match element.tag with 698 + | Tag.Html `Table -> 699 + (* Push a new table onto the stack *) 700 + state.tables := make_table () :: !(state.tables) 701 + | Tag.Html tag -> ( 702 + match !(state.tables) with 703 + | [] -> () 704 + | table :: _ -> ( 705 + match tag with 706 + | `Td -> start_cell table false attrs collector 707 + | `Th -> start_cell table true attrs collector 708 + | `Tr -> start_row table collector 709 + | `Tbody | `Thead | `Tfoot -> 710 + let name = Tag.html_tag_to_string tag in 711 + start_row_group table name collector 712 + | `Col -> start_col table attrs collector 713 + | `Colgroup -> start_colgroup table attrs collector 714 + | _ -> ())) 715 + | _ -> () (* Non-HTML elements *) 714 716 715 - let end_element state ~name ~namespace collector = 716 - if is_html_namespace namespace then ( 717 - let name_lower = String.lowercase_ascii name in 718 - match name_lower with 719 - | "table" -> ( 720 - match !(state.tables) with 721 - | [] -> () (* End tag without start - ignore *) 722 - | table :: rest -> 723 - end_table table collector; 724 - state.tables := rest) 725 - | _ -> ( 726 - match !(state.tables) with 727 - | [] -> () 728 - | table :: _ -> ( 729 - match name_lower with 730 - | "td" | "th" -> end_cell table 731 - | "tr" -> end_row table collector 732 - | "tbody" | "thead" | "tfoot" -> end_row_group_handler table collector 733 - | "col" -> end_col table 734 - | "colgroup" -> end_colgroup table 735 - | _ -> ()))) 717 + let end_element state ~tag collector = 718 + match tag with 719 + | Tag.Html `Table -> ( 720 + match !(state.tables) with 721 + | [] -> () (* End tag without start - ignore *) 722 + | table :: rest -> 723 + end_table table collector; 724 + state.tables := rest) 725 + | Tag.Html html_tag -> ( 726 + match !(state.tables) with 727 + | [] -> () 728 + | table :: _ -> ( 729 + match html_tag with 730 + | `Td | `Th -> end_cell table 731 + | `Tr -> end_row table collector 732 + | `Tbody | `Thead | `Tfoot -> end_row_group_handler table collector 733 + | `Col -> end_col table 734 + | `Colgroup -> end_colgroup table 735 + | _ -> ())) 736 + | _ -> () (* Non-HTML elements *) 736 737 737 738 let characters _state _text _collector = () 738 739
+26 -43
lib/htmlrw_check/specialized/title_checker.ml
··· 26 26 state.title_depth <- 0; 27 27 state.is_iframe_srcdoc <- false 28 28 29 - let start_element state ~name ~namespace ~attrs collector = 30 - ignore (collector, attrs); 31 - if namespace <> None then () 32 - else begin 33 - let name_lower = String.lowercase_ascii name in 34 - match name_lower with 35 - | "html" -> 36 - (* Check if this is an iframe srcdoc - title is not required *) 37 - (* We detect this by checking for srcdoc context - not directly checkable from HTML, 38 - but we can assume normal HTML document for now *) 39 - () 40 - | "head" -> 41 - state.in_head <- true 42 - | "title" when state.in_head -> 43 - state.has_title <- true; 44 - state.in_title <- true; 45 - state.title_has_content <- false; 46 - state.title_depth <- 0 47 - | _ -> () 48 - end; 29 + let start_element state ~element _collector = 30 + (match element.Element.tag with 31 + | Tag.Html `Html -> () 32 + | Tag.Html `Head -> 33 + state.in_head <- true 34 + | Tag.Html `Title when state.in_head -> 35 + state.has_title <- true; 36 + state.in_title <- true; 37 + state.title_has_content <- false; 38 + state.title_depth <- 0 39 + | _ -> ()); 49 40 if state.in_title then 50 41 state.title_depth <- state.title_depth + 1 51 42 52 - let end_element state ~name ~namespace collector = 53 - if namespace <> None then () 54 - else begin 55 - let name_lower = String.lowercase_ascii name in 56 - 57 - if state.in_title then 58 - state.title_depth <- state.title_depth - 1; 59 - 60 - match name_lower with 61 - | "title" when state.in_title && state.title_depth = 0 -> 62 - (* Check if title was empty *) 63 - if not state.title_has_content then 64 - Message_collector.add_typed collector 65 - (`Element (`Must_not_be_empty (`Elem "title"))); 66 - state.in_title <- false 67 - | "head" -> 68 - (* Check if head had a title element *) 69 - if state.in_head && not state.has_title then 70 - Message_collector.add_typed collector 71 - (`Element (`Missing_child (`Parent "head", `Child "title"))); 72 - state.in_head <- false 73 - | _ -> () 74 - end 43 + let end_element state ~tag collector = 44 + if state.in_title then 45 + state.title_depth <- state.title_depth - 1; 46 + match tag with 47 + | Tag.Html `Title when state.in_title && state.title_depth = 0 -> 48 + if not state.title_has_content then 49 + Message_collector.add_typed collector 50 + (`Element (`Must_not_be_empty (`Elem "title"))); 51 + state.in_title <- false 52 + | Tag.Html `Head -> 53 + if state.in_head && not state.has_title then 54 + Message_collector.add_typed collector 55 + (`Element (`Missing_child (`Parent "head", `Child "title"))); 56 + state.in_head <- false 57 + | _ -> () 75 58 76 59 let characters state text _collector = 77 60 if state.in_title then begin
+24 -26
lib/htmlrw_check/specialized/unknown_element_checker.ml
··· 67 67 let reset state = 68 68 state.stack <- [] 69 69 70 - let start_element state ~name ~namespace ~attrs:_ collector = 71 - (* Only check HTML namespace elements *) 72 - match namespace with 73 - | Some _ -> () (* Skip SVG, MathML, etc. *) 74 - | None -> 75 - let name_lower = String.lowercase_ascii name in 70 + let start_element state ~element collector = 71 + match element.Element.tag with 72 + | Tag.Unknown name -> 73 + (* Get the parent element name *) 74 + let parent = match state.stack with 75 + | p :: _ -> p 76 + | [] -> "document" 77 + in 78 + (* Produce error: unknown element not allowed as child *) 79 + Message_collector.add_typed collector 80 + (`Element (`Not_allowed_as_child (`Child name, `Parent parent))); 81 + (* Push to stack for tracking *) 82 + state.stack <- name :: state.stack 76 83 77 - (* Check if element is unknown *) 78 - if not (is_known_element name_lower) then begin 79 - (* Get the parent element name *) 80 - let parent = match state.stack with 81 - | p :: _ -> p 82 - | [] -> "document" 83 - in 84 - (* Produce error: unknown element not allowed as child *) 85 - Message_collector.add_typed collector 86 - (`Element (`Not_allowed_as_child (`Child name, `Parent parent))) 87 - end; 84 + | Tag.Html tag -> 85 + let name_lower = String.lowercase_ascii (Tag.tag_to_string (Tag.Html tag)) in 86 + state.stack <- name_lower :: state.stack 88 87 89 - (* Always push to stack for tracking *) 90 - state.stack <- name_lower :: state.stack 88 + | _ -> () (* SVG, MathML, Custom elements are allowed *) 91 89 92 - let end_element state ~name:_ ~namespace _ = 93 - match namespace with 94 - | Some _ -> () 95 - | None -> 96 - match state.stack with 97 - | _ :: rest -> state.stack <- rest 98 - | [] -> () (* Stack underflow - shouldn't happen *) 90 + let end_element state ~tag _ = 91 + match tag with 92 + | Tag.Html _ | Tag.Unknown _ -> 93 + (match state.stack with 94 + | _ :: rest -> state.stack <- rest 95 + | [] -> ()) (* Stack underflow - shouldn't happen *) 96 + | _ -> () (* SVG, MathML, Custom elements *) 99 97 100 98 let characters _state _text _collector = () 101 99
+7 -5
lib/htmlrw_check/specialized/url_checker.ml
··· 741 741 if String.lowercase_ascii k = String.lowercase_ascii name then Some v else None 742 742 ) attrs 743 743 744 - let start_element _state ~name ~namespace ~attrs collector = 745 - if namespace <> None then () 746 - else begin 744 + let start_element _state ~element collector = 745 + match element.Element.tag with 746 + | Tag.Html _ -> 747 + let name = Tag.tag_to_string element.tag in 747 748 let name_lower = String.lowercase_ascii name in 749 + let attrs = element.raw_attrs in 748 750 (* Check URL attributes for elements that have them *) 749 751 (match List.assoc_opt name_lower url_attributes with 750 752 | None -> () ··· 808 810 | Some warn_msg -> Message_collector.add_typed collector (`Generic warn_msg) 809 811 | None -> ()) 810 812 | _ -> ()) 811 - end 813 + | _ -> () (* Non-HTML elements *) 812 814 813 - let end_element _state ~name:_ ~namespace:_ _collector = () 815 + let end_element _state ~tag:_ _collector = () 814 816 let characters _state _text _collector = () 815 817 let end_document _state _collector = () 816 818
+5 -4
lib/htmlrw_check/specialized/xhtml_content_checker.ml
··· 52 52 Message_collector.add_typed collector (`Attr `Data_uppercase) 53 53 ) attrs 54 54 55 - let start_element state ~name ~namespace ~attrs collector = 56 - ignore namespace; 55 + let start_element state ~element collector = 56 + let name = Tag.tag_to_string element.Element.tag in 57 57 let name_lower = String.lowercase_ascii name in 58 + let attrs = element.raw_attrs in 58 59 59 60 (* Check data-* attributes for uppercase *) 60 61 check_data_attr_case attrs collector; ··· 97 98 (* Push onto stack *) 98 99 state.element_stack <- name :: state.element_stack 99 100 100 - let end_element state ~name ~namespace:_ _collector = 101 - let name_lower = String.lowercase_ascii name in 101 + let end_element state ~tag _collector = 102 + let name_lower = String.lowercase_ascii (Tag.tag_to_string tag) in 102 103 (* Pop figure state if leaving a figure *) 103 104 if name_lower = "figure" then begin 104 105 match state.figure_stack with