OCaml HTML5 parser/serialiser based on Python's JustHTML

fix

Changed files
+96 -26
lib
+2 -2
lib/htmlrw_check/element/tag.ml
··· 256 let tag_of_string ?namespace name = 257 let name_lower = String.lowercase_ascii name in 258 match namespace with 259 - | Some ns when is_svg_namespace ns -> Svg name_lower 260 - | Some ns when is_mathml_namespace ns -> MathML name_lower 261 | Some _ -> Unknown name_lower (* Unknown namespace *) 262 | None -> 263 match html_tag_of_string_opt name_lower with
··· 256 let tag_of_string ?namespace name = 257 let name_lower = String.lowercase_ascii name in 258 match namespace with 259 + | Some ns when is_svg_namespace ns -> Svg name (* Preserve original case for SVG *) 260 + | Some ns when is_mathml_namespace ns -> MathML name (* Preserve original case for MathML *) 261 | Some _ -> Unknown name_lower (* Unknown namespace *) 262 | None -> 263 match html_tag_of_string_opt name_lower with
+12
lib/htmlrw_check/error_code.ml
··· 61 | `Unrecognized_role of [`Token of string] 62 | `Tab_without_tabpanel 63 | `Multiple_main 64 ] 65 66 type li_role_error = [ ··· 257 | `Aria (`Unrecognized_role _) -> "unrecognized-role" 258 | `Aria `Tab_without_tabpanel -> "tab-without-tabpanel" 259 | `Aria `Multiple_main -> "multiple-main" 260 261 (* List item role errors *) 262 | `Li_role `Div_in_dl_bad_role -> "invalid-role" ··· 491 | `Aria `Multiple_main -> 492 Printf.sprintf "A document should not include more than one visible element with %s." 493 (q "role=main") 494 495 (* List item role errors *) 496 | `Li_role `Div_in_dl_bad_role ->
··· 61 | `Unrecognized_role of [`Token of string] 62 | `Tab_without_tabpanel 63 | `Multiple_main 64 + | `Accessible_name_prohibited of [`Attr of string] * [`Elem of string] 65 ] 66 67 type li_role_error = [ ··· 258 | `Aria (`Unrecognized_role _) -> "unrecognized-role" 259 | `Aria `Tab_without_tabpanel -> "tab-without-tabpanel" 260 | `Aria `Multiple_main -> "multiple-main" 261 + | `Aria (`Accessible_name_prohibited _) -> "aria-not-allowed" 262 263 (* List item role errors *) 264 | `Li_role `Div_in_dl_bad_role -> "invalid-role" ··· 493 | `Aria `Multiple_main -> 494 Printf.sprintf "A document should not include more than one visible element with %s." 495 (q "role=main") 496 + | `Aria (`Accessible_name_prohibited (`Attr attr, `Elem element)) -> 497 + (* Roles that prohibit accessible names - defined by ARIA spec *) 498 + let prohibited_roles = [ 499 + "caption"; "code"; "deletion"; "emphasis"; "generic"; "insertion"; 500 + "paragraph"; "presentation"; "strong"; "subscript"; "superscript" 501 + ] in 502 + let roles_str = String.concat ", " (List.map q (List.rev (List.tl (List.rev prohibited_roles)))) ^ 503 + ", or " ^ q (List.hd (List.rev prohibited_roles)) in 504 + Printf.sprintf "The %s attribute must not be specified on any %s element unless the element has a %s value other than %s." 505 + (q attr) (q element) (q "role") roles_str 506 507 (* List item role errors *) 508 | `Li_role `Div_in_dl_bad_role ->
+6
lib/htmlrw_check/error_code.mli
··· 312 (** Document has multiple visible main landmarks. 313 Only one visible [role="main"] or [<main>] should exist 314 per document for proper landmark navigation. *) 315 ] 316 317 (** List item role constraint errors.
··· 312 (** Document has multiple visible main landmarks. 313 Only one visible [role="main"] or [<main>] should exist 314 per document for proper landmark navigation. *) 315 + 316 + | `Accessible_name_prohibited of [`Attr of string] * [`Elem of string] 317 + (** Accessible name attribute not allowed on element with generic role. 318 + Elements with implicit [role="generic"] (or no role) cannot have 319 + [aria-label], [aria-labelledby], or [aria-braillelabel] unless 320 + they have an explicit role that supports accessible names. *) 321 ] 322 323 (** List item role constraint errors.
+58 -18
lib/htmlrw_check/specialized/aria_checker.ml
··· 1 (** ARIA validation checker implementation. *) 2 3 (** Valid WAI-ARIA 1.2 roles. 4 5 These are all the valid role values according to the WAI-ARIA 1.2 ··· 422 let render_role_set roles = 423 match roles with 424 | [] -> "" 425 - | [role] -> "\"" ^ role ^ "\"" 426 | _ -> 427 - let quoted = List.map (fun r -> "\"" ^ r ^ "\"") roles in 428 String.concat " or " quoted 429 430 let start_element state ~element collector = ··· 505 (* Generate error if element cannot have accessible name but has one *) 506 if has_aria_label && not can_have_name then 507 Message_collector.add_typed collector 508 - (`Aria (`Must_not_specify (`Attr "aria-label", `Elem name, 509 - `Condition "the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d"))); 510 511 if has_aria_labelledby && not can_have_name then 512 Message_collector.add_typed collector 513 - (`Aria (`Must_not_specify (`Attr "aria-labelledby", `Elem name, 514 - `Condition "the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d"))); 515 516 if has_aria_braillelabel && not can_have_name then 517 Message_collector.add_typed collector 518 - (`Aria (`Must_not_specify (`Attr "aria-braillelabel", `Elem name, 519 - `Condition "the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d"))); 520 521 (* Check for img with empty alt having role attribute *) 522 if name_lower = "img" then begin ··· 616 | None -> "text" 617 in 618 if not has_list && input_type = "text" then 619 - "for an \xe2\x80\x9cinput\xe2\x80\x9d element that has no \xe2\x80\x9clist\xe2\x80\x9d attribute and whose type is \xe2\x80\x9ctext\xe2\x80\x9d" 620 else 621 - Printf.sprintf "for element \xe2\x80\x9c%s\xe2\x80\x9d" name 622 end else 623 - Printf.sprintf "for element \xe2\x80\x9c%s\xe2\x80\x9d" name 624 in 625 Message_collector.add_typed collector 626 (`Aria (`Unnecessary_role (`Role first_role, `Elem name, `Reason reason))) ··· 644 if Hashtbl.mem roles_which_cannot_be_named role && has_accessible_name then 645 Message_collector.add_typed collector 646 (`Generic (Printf.sprintf 647 - "Elements with role=\"%s\" must not have accessible names (via aria-label or aria-labelledby)." 648 - role)); 649 650 (* Check for required ancestor roles *) 651 begin match Hashtbl.find_opt required_role_ancestor_by_descendant role with ··· 653 if not (has_required_ancestor_role state required_ancestors) then 654 Message_collector.add_typed collector 655 (`Generic (Printf.sprintf 656 - "An element with \"role=%s\" must be contained in, or owned by, an element with the \"role\" value %s." 657 - role 658 (render_role_set required_ancestors))) 659 | None -> () 660 end; ··· 682 if value_lower = default_value then 683 Message_collector.add_typed collector 684 (`Generic (Printf.sprintf 685 - "The \xe2\x80\x9c%s\xe2\x80\x9d attribute is unnecessary for the value \xe2\x80\x9c%s\xe2\x80\x9d." 686 - attr_name attr_value)) 687 | None -> () 688 ) attrs; 689 ··· 724 implicit_role; 725 } in 726 state.stack <- node :: state.stack 727 - | _ -> () (* Skip non-HTML elements *) 728 729 let end_element state ~tag _collector = 730 (* Only process HTML elements *)
··· 1 (** ARIA validation checker implementation. *) 2 3 + (** Quote helper for consistent message formatting. *) 4 + let q = Error_code.q 5 + 6 (** Valid WAI-ARIA 1.2 roles. 7 8 These are all the valid role values according to the WAI-ARIA 1.2 ··· 425 let render_role_set roles = 426 match roles with 427 | [] -> "" 428 + | [role] -> q role 429 | _ -> 430 + let quoted = List.map q roles in 431 String.concat " or " quoted 432 433 let start_element state ~element collector = ··· 508 (* Generate error if element cannot have accessible name but has one *) 509 if has_aria_label && not can_have_name then 510 Message_collector.add_typed collector 511 + (`Aria (`Accessible_name_prohibited (`Attr "aria-label", `Elem name))); 512 513 if has_aria_labelledby && not can_have_name then 514 Message_collector.add_typed collector 515 + (`Aria (`Accessible_name_prohibited (`Attr "aria-labelledby", `Elem name))); 516 517 if has_aria_braillelabel && not can_have_name then 518 Message_collector.add_typed collector 519 + (`Aria (`Accessible_name_prohibited (`Attr "aria-braillelabel", `Elem name))); 520 521 (* Check for img with empty alt having role attribute *) 522 if name_lower = "img" then begin ··· 616 | None -> "text" 617 in 618 if not has_list && input_type = "text" then 619 + Printf.sprintf "for an %s element that has no %s attribute and whose type is %s" 620 + (q "input") (q "list") (q "text") 621 else 622 + Printf.sprintf "for element %s" (q name) 623 end else 624 + Printf.sprintf "for element %s" (q name) 625 in 626 Message_collector.add_typed collector 627 (`Aria (`Unnecessary_role (`Role first_role, `Elem name, `Reason reason))) ··· 645 if Hashtbl.mem roles_which_cannot_be_named role && has_accessible_name then 646 Message_collector.add_typed collector 647 (`Generic (Printf.sprintf 648 + "Elements with %s must not have accessible names (via aria-label or aria-labelledby)." 649 + (q ("role=" ^ role)))); 650 651 (* Check for required ancestor roles *) 652 begin match Hashtbl.find_opt required_role_ancestor_by_descendant role with ··· 654 if not (has_required_ancestor_role state required_ancestors) then 655 Message_collector.add_typed collector 656 (`Generic (Printf.sprintf 657 + "An element with %s must be contained in, or owned by, an element with the %s value %s." 658 + (q ("role=" ^ role)) 659 + (q "role") 660 (render_role_set required_ancestors))) 661 | None -> () 662 end; ··· 684 if value_lower = default_value then 685 Message_collector.add_typed collector 686 (`Generic (Printf.sprintf 687 + "The %s attribute is unnecessary for the value %s." 688 + (q attr_name) (q attr_value))) 689 | None -> () 690 ) attrs; 691 ··· 726 implicit_role; 727 } in 728 state.stack <- node :: state.stack 729 + 730 + | Tag.Custom name -> 731 + (* Custom elements (autonomous custom elements) have generic role by default 732 + and cannot have accessible names unless they have an explicit role *) 733 + let attrs = element.raw_attrs in 734 + let role_attr = List.assoc_opt "role" attrs in 735 + let aria_label = List.assoc_opt "aria-label" attrs in 736 + let aria_labelledby = List.assoc_opt "aria-labelledby" attrs in 737 + let aria_braillelabel = List.assoc_opt "aria-braillelabel" attrs in 738 + let has_aria_label = match aria_label with Some v -> String.trim v <> "" | None -> false in 739 + let has_aria_labelledby = match aria_labelledby with Some v -> String.trim v <> "" | None -> false in 740 + let has_aria_braillelabel = match aria_braillelabel with Some v -> String.trim v <> "" | None -> false in 741 + 742 + (* Parse explicit roles from role attribute *) 743 + let explicit_roles = match role_attr with 744 + | Some role_value -> split_roles role_value 745 + | None -> [] 746 + in 747 + 748 + (* Custom elements have no implicit role (generic) *) 749 + let implicit_role = None in 750 + 751 + (* Check if element can have accessible names *) 752 + let can_have_name = element_can_have_accessible_name name explicit_roles implicit_role in 753 + 754 + (* Generate error if element cannot have accessible name but has one *) 755 + if has_aria_label && not can_have_name then 756 + Message_collector.add_typed collector 757 + (`Aria (`Accessible_name_prohibited (`Attr "aria-label", `Elem name))); 758 + 759 + if has_aria_labelledby && not can_have_name then 760 + Message_collector.add_typed collector 761 + (`Aria (`Accessible_name_prohibited (`Attr "aria-labelledby", `Elem name))); 762 + 763 + if has_aria_braillelabel && not can_have_name then 764 + Message_collector.add_typed collector 765 + (`Aria (`Accessible_name_prohibited (`Attr "aria-braillelabel", `Elem name))) 766 + 767 + | _ -> () (* Skip SVG, MathML, Unknown elements *) 768 769 let end_element state ~tag _collector = 770 (* Only process HTML elements *)
+18 -6
lib/htmlrw_check/specialized/svg_checker.ml
··· 8 9 type fecomponenttransfer_state = { 10 mutable seen_funcs : string list; (* track feFuncR, feFuncG, etc. *) 11 } 12 13 type state = { ··· 366 | parent :: _ when String.lowercase_ascii parent = "a" -> 367 if List.mem name_lower a_disallowed_children then 368 Message_collector.add_typed collector 369 - (`Element (`Not_allowed_as_child (`Child name_lower, `Parent "a"))) 370 | _ -> ()); 371 372 (* 2. Track missing-glyph in font *) ··· 402 if List.mem name_lower ["fefuncr"; "fefuncg"; "fefuncb"; "fefunca"] then begin 403 match state.fecomponenttransfer_stack with 404 | fect :: _ -> 405 - if List.mem name_lower fect.seen_funcs then 406 - Message_collector.add_typed collector 407 - (`Element (`Not_allowed_as_child (`Child name, `Parent "feComponentTransfer"))) 408 - else 409 fect.seen_funcs <- name_lower :: fect.seen_funcs 410 | [] -> () 411 end ··· 415 if name_lower = "font" then 416 state.font_stack <- { has_missing_glyph = false } :: state.font_stack; 417 if name_lower = "fecomponenttransfer" then 418 - state.fecomponenttransfer_stack <- { seen_funcs = [] } :: state.fecomponenttransfer_stack; 419 420 state.element_stack <- name :: state.element_stack; 421
··· 8 9 type fecomponenttransfer_state = { 10 mutable seen_funcs : string list; (* track feFuncR, feFuncG, etc. *) 11 + mutable duplicate_error_reported : bool; (* suppress further duplicate errors *) 12 } 13 14 type state = { ··· 367 | parent :: _ when String.lowercase_ascii parent = "a" -> 368 if List.mem name_lower a_disallowed_children then 369 Message_collector.add_typed collector 370 + (`Element (`Not_allowed_as_child (`Child name, `Parent "a"))) 371 | _ -> ()); 372 373 (* 2. Track missing-glyph in font *) ··· 403 if List.mem name_lower ["fefuncr"; "fefuncg"; "fefuncb"; "fefunca"] then begin 404 match state.fecomponenttransfer_stack with 405 | fect :: _ -> 406 + if List.mem name_lower fect.seen_funcs then begin 407 + (* Only report first duplicate error, suppress further *) 408 + if not fect.duplicate_error_reported then begin 409 + Message_collector.add_typed collector 410 + (`Element (`Not_allowed_as_child (`Child name, `Parent "feComponentTransfer"))); 411 + fect.duplicate_error_reported <- true 412 + end 413 + end else 414 fect.seen_funcs <- name_lower :: fect.seen_funcs 415 | [] -> () 416 end ··· 420 if name_lower = "font" then 421 state.font_stack <- { has_missing_glyph = false } :: state.font_stack; 422 if name_lower = "fecomponenttransfer" then 423 + state.fecomponenttransfer_stack <- { seen_funcs = []; duplicate_error_reported = false } :: state.fecomponenttransfer_stack; 424 + 425 + (* Check feConvolveMatrix requires order attribute *) 426 + if name_lower = "feconvolvematrix" then begin 427 + if not (Attr_utils.has_attr "order" attrs) then 428 + Message_collector.add_typed collector 429 + (`Svg (`Missing_attr (`Elem "feConvolveMatrix", `Attr "order"))) 430 + end; 431 432 state.element_stack <- name :: state.element_stack; 433