OCaml HTML5 parser/serialiser based on Python's JustHTML

fix

Changed files
+96 -26
lib
+2 -2
lib/htmlrw_check/element/tag.ml
··· 256 256 let tag_of_string ?namespace name = 257 257 let name_lower = String.lowercase_ascii name in 258 258 match namespace with 259 - | Some ns when is_svg_namespace ns -> Svg name_lower 260 - | Some ns when is_mathml_namespace ns -> MathML name_lower 259 + | Some ns when is_svg_namespace ns -> Svg name (* Preserve original case for SVG *) 260 + | Some ns when is_mathml_namespace ns -> MathML name (* Preserve original case for MathML *) 261 261 | Some _ -> Unknown name_lower (* Unknown namespace *) 262 262 | None -> 263 263 match html_tag_of_string_opt name_lower with
+12
lib/htmlrw_check/error_code.ml
··· 61 61 | `Unrecognized_role of [`Token of string] 62 62 | `Tab_without_tabpanel 63 63 | `Multiple_main 64 + | `Accessible_name_prohibited of [`Attr of string] * [`Elem of string] 64 65 ] 65 66 66 67 type li_role_error = [ ··· 257 258 | `Aria (`Unrecognized_role _) -> "unrecognized-role" 258 259 | `Aria `Tab_without_tabpanel -> "tab-without-tabpanel" 259 260 | `Aria `Multiple_main -> "multiple-main" 261 + | `Aria (`Accessible_name_prohibited _) -> "aria-not-allowed" 260 262 261 263 (* List item role errors *) 262 264 | `Li_role `Div_in_dl_bad_role -> "invalid-role" ··· 491 493 | `Aria `Multiple_main -> 492 494 Printf.sprintf "A document should not include more than one visible element with %s." 493 495 (q "role=main") 496 + | `Aria (`Accessible_name_prohibited (`Attr attr, `Elem element)) -> 497 + (* Roles that prohibit accessible names - defined by ARIA spec *) 498 + let prohibited_roles = [ 499 + "caption"; "code"; "deletion"; "emphasis"; "generic"; "insertion"; 500 + "paragraph"; "presentation"; "strong"; "subscript"; "superscript" 501 + ] in 502 + let roles_str = String.concat ", " (List.map q (List.rev (List.tl (List.rev prohibited_roles)))) ^ 503 + ", or " ^ q (List.hd (List.rev prohibited_roles)) in 504 + Printf.sprintf "The %s attribute must not be specified on any %s element unless the element has a %s value other than %s." 505 + (q attr) (q element) (q "role") roles_str 494 506 495 507 (* List item role errors *) 496 508 | `Li_role `Div_in_dl_bad_role ->
+6
lib/htmlrw_check/error_code.mli
··· 312 312 (** Document has multiple visible main landmarks. 313 313 Only one visible [role="main"] or [<main>] should exist 314 314 per document for proper landmark navigation. *) 315 + 316 + | `Accessible_name_prohibited of [`Attr of string] * [`Elem of string] 317 + (** Accessible name attribute not allowed on element with generic role. 318 + Elements with implicit [role="generic"] (or no role) cannot have 319 + [aria-label], [aria-labelledby], or [aria-braillelabel] unless 320 + they have an explicit role that supports accessible names. *) 315 321 ] 316 322 317 323 (** List item role constraint errors.
+58 -18
lib/htmlrw_check/specialized/aria_checker.ml
··· 1 1 (** ARIA validation checker implementation. *) 2 2 3 + (** Quote helper for consistent message formatting. *) 4 + let q = Error_code.q 5 + 3 6 (** Valid WAI-ARIA 1.2 roles. 4 7 5 8 These are all the valid role values according to the WAI-ARIA 1.2 ··· 422 425 let render_role_set roles = 423 426 match roles with 424 427 | [] -> "" 425 - | [role] -> "\"" ^ role ^ "\"" 428 + | [role] -> q role 426 429 | _ -> 427 - let quoted = List.map (fun r -> "\"" ^ r ^ "\"") roles in 430 + let quoted = List.map q roles in 428 431 String.concat " or " quoted 429 432 430 433 let start_element state ~element collector = ··· 505 508 (* Generate error if element cannot have accessible name but has one *) 506 509 if has_aria_label && not can_have_name then 507 510 Message_collector.add_typed collector 508 - (`Aria (`Must_not_specify (`Attr "aria-label", `Elem name, 509 - `Condition "the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d"))); 511 + (`Aria (`Accessible_name_prohibited (`Attr "aria-label", `Elem name))); 510 512 511 513 if has_aria_labelledby && not can_have_name then 512 514 Message_collector.add_typed collector 513 - (`Aria (`Must_not_specify (`Attr "aria-labelledby", `Elem name, 514 - `Condition "the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d"))); 515 + (`Aria (`Accessible_name_prohibited (`Attr "aria-labelledby", `Elem name))); 515 516 516 517 if has_aria_braillelabel && not can_have_name then 517 518 Message_collector.add_typed collector 518 - (`Aria (`Must_not_specify (`Attr "aria-braillelabel", `Elem name, 519 - `Condition "the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d"))); 519 + (`Aria (`Accessible_name_prohibited (`Attr "aria-braillelabel", `Elem name))); 520 520 521 521 (* Check for img with empty alt having role attribute *) 522 522 if name_lower = "img" then begin ··· 616 616 | None -> "text" 617 617 in 618 618 if not has_list && input_type = "text" then 619 - "for an \xe2\x80\x9cinput\xe2\x80\x9d element that has no \xe2\x80\x9clist\xe2\x80\x9d attribute and whose type is \xe2\x80\x9ctext\xe2\x80\x9d" 619 + Printf.sprintf "for an %s element that has no %s attribute and whose type is %s" 620 + (q "input") (q "list") (q "text") 620 621 else 621 - Printf.sprintf "for element \xe2\x80\x9c%s\xe2\x80\x9d" name 622 + Printf.sprintf "for element %s" (q name) 622 623 end else 623 - Printf.sprintf "for element \xe2\x80\x9c%s\xe2\x80\x9d" name 624 + Printf.sprintf "for element %s" (q name) 624 625 in 625 626 Message_collector.add_typed collector 626 627 (`Aria (`Unnecessary_role (`Role first_role, `Elem name, `Reason reason))) ··· 644 645 if Hashtbl.mem roles_which_cannot_be_named role && has_accessible_name then 645 646 Message_collector.add_typed collector 646 647 (`Generic (Printf.sprintf 647 - "Elements with role=\"%s\" must not have accessible names (via aria-label or aria-labelledby)." 648 - role)); 648 + "Elements with %s must not have accessible names (via aria-label or aria-labelledby)." 649 + (q ("role=" ^ role)))); 649 650 650 651 (* Check for required ancestor roles *) 651 652 begin match Hashtbl.find_opt required_role_ancestor_by_descendant role with ··· 653 654 if not (has_required_ancestor_role state required_ancestors) then 654 655 Message_collector.add_typed collector 655 656 (`Generic (Printf.sprintf 656 - "An element with \"role=%s\" must be contained in, or owned by, an element with the \"role\" value %s." 657 - role 657 + "An element with %s must be contained in, or owned by, an element with the %s value %s." 658 + (q ("role=" ^ role)) 659 + (q "role") 658 660 (render_role_set required_ancestors))) 659 661 | None -> () 660 662 end; ··· 682 684 if value_lower = default_value then 683 685 Message_collector.add_typed collector 684 686 (`Generic (Printf.sprintf 685 - "The \xe2\x80\x9c%s\xe2\x80\x9d attribute is unnecessary for the value \xe2\x80\x9c%s\xe2\x80\x9d." 686 - attr_name attr_value)) 687 + "The %s attribute is unnecessary for the value %s." 688 + (q attr_name) (q attr_value))) 687 689 | None -> () 688 690 ) attrs; 689 691 ··· 724 726 implicit_role; 725 727 } in 726 728 state.stack <- node :: state.stack 727 - | _ -> () (* Skip non-HTML elements *) 729 + 730 + | Tag.Custom name -> 731 + (* Custom elements (autonomous custom elements) have generic role by default 732 + and cannot have accessible names unless they have an explicit role *) 733 + let attrs = element.raw_attrs in 734 + let role_attr = List.assoc_opt "role" attrs in 735 + let aria_label = List.assoc_opt "aria-label" attrs in 736 + let aria_labelledby = List.assoc_opt "aria-labelledby" attrs in 737 + let aria_braillelabel = List.assoc_opt "aria-braillelabel" attrs in 738 + let has_aria_label = match aria_label with Some v -> String.trim v <> "" | None -> false in 739 + let has_aria_labelledby = match aria_labelledby with Some v -> String.trim v <> "" | None -> false in 740 + let has_aria_braillelabel = match aria_braillelabel with Some v -> String.trim v <> "" | None -> false in 741 + 742 + (* Parse explicit roles from role attribute *) 743 + let explicit_roles = match role_attr with 744 + | Some role_value -> split_roles role_value 745 + | None -> [] 746 + in 747 + 748 + (* Custom elements have no implicit role (generic) *) 749 + let implicit_role = None in 750 + 751 + (* Check if element can have accessible names *) 752 + let can_have_name = element_can_have_accessible_name name explicit_roles implicit_role in 753 + 754 + (* Generate error if element cannot have accessible name but has one *) 755 + if has_aria_label && not can_have_name then 756 + Message_collector.add_typed collector 757 + (`Aria (`Accessible_name_prohibited (`Attr "aria-label", `Elem name))); 758 + 759 + if has_aria_labelledby && not can_have_name then 760 + Message_collector.add_typed collector 761 + (`Aria (`Accessible_name_prohibited (`Attr "aria-labelledby", `Elem name))); 762 + 763 + if has_aria_braillelabel && not can_have_name then 764 + Message_collector.add_typed collector 765 + (`Aria (`Accessible_name_prohibited (`Attr "aria-braillelabel", `Elem name))) 766 + 767 + | _ -> () (* Skip SVG, MathML, Unknown elements *) 728 768 729 769 let end_element state ~tag _collector = 730 770 (* Only process HTML elements *)
+18 -6
lib/htmlrw_check/specialized/svg_checker.ml
··· 8 8 9 9 type fecomponenttransfer_state = { 10 10 mutable seen_funcs : string list; (* track feFuncR, feFuncG, etc. *) 11 + mutable duplicate_error_reported : bool; (* suppress further duplicate errors *) 11 12 } 12 13 13 14 type state = { ··· 366 367 | parent :: _ when String.lowercase_ascii parent = "a" -> 367 368 if List.mem name_lower a_disallowed_children then 368 369 Message_collector.add_typed collector 369 - (`Element (`Not_allowed_as_child (`Child name_lower, `Parent "a"))) 370 + (`Element (`Not_allowed_as_child (`Child name, `Parent "a"))) 370 371 | _ -> ()); 371 372 372 373 (* 2. Track missing-glyph in font *) ··· 402 403 if List.mem name_lower ["fefuncr"; "fefuncg"; "fefuncb"; "fefunca"] then begin 403 404 match state.fecomponenttransfer_stack with 404 405 | fect :: _ -> 405 - if List.mem name_lower fect.seen_funcs then 406 - Message_collector.add_typed collector 407 - (`Element (`Not_allowed_as_child (`Child name, `Parent "feComponentTransfer"))) 408 - else 406 + if List.mem name_lower fect.seen_funcs then begin 407 + (* Only report first duplicate error, suppress further *) 408 + if not fect.duplicate_error_reported then begin 409 + Message_collector.add_typed collector 410 + (`Element (`Not_allowed_as_child (`Child name, `Parent "feComponentTransfer"))); 411 + fect.duplicate_error_reported <- true 412 + end 413 + end else 409 414 fect.seen_funcs <- name_lower :: fect.seen_funcs 410 415 | [] -> () 411 416 end ··· 415 420 if name_lower = "font" then 416 421 state.font_stack <- { has_missing_glyph = false } :: state.font_stack; 417 422 if name_lower = "fecomponenttransfer" then 418 - state.fecomponenttransfer_stack <- { seen_funcs = [] } :: state.fecomponenttransfer_stack; 423 + state.fecomponenttransfer_stack <- { seen_funcs = []; duplicate_error_reported = false } :: state.fecomponenttransfer_stack; 424 + 425 + (* Check feConvolveMatrix requires order attribute *) 426 + if name_lower = "feconvolvematrix" then begin 427 + if not (Attr_utils.has_attr "order" attrs) then 428 + Message_collector.add_typed collector 429 + (`Svg (`Missing_attr (`Elem "feConvolveMatrix", `Attr "order"))) 430 + end; 419 431 420 432 state.element_stack <- name :: state.element_stack; 421 433