OCaml HTML5 parser/serialiser based on Python's JustHTML

more

Changed files
+82 -10
lib
+19 -2
lib/html5_checker/specialized/aria_checker.ml
··· 425 425 | parent :: _ -> Some parent.element_name 426 426 | [] -> None 427 427 428 + (** Check if an li element is in a "list context": 429 + - Inside a ul/ol/menu element with no explicit role, OR 430 + - Inside any element with role=list *) 431 + let is_in_list_context state = 432 + List.exists (fun ancestor -> 433 + (* Check for role=list on any ancestor *) 434 + if List.mem "list" ancestor.explicit_roles then true 435 + (* Check for ul/ol/menu with no explicit role *) 436 + else match ancestor.element_name with 437 + | "ul" | "ol" | "menu" -> ancestor.explicit_roles = [] 438 + | _ -> false 439 + ) state.stack 440 + 428 441 (** Render a list of roles as a human-readable string. *) 429 442 let render_role_set roles = 430 443 match roles with ··· 548 561 end 549 562 end; 550 563 551 - (* Check li role restrictions in menu/menubar/tablist contexts *) 564 + (* Check li role restrictions in menu/menubar/tablist/list contexts *) 552 565 if name_lower = "li" && explicit_roles <> [] then begin 553 566 let first_role = List.hd explicit_roles in 554 567 (* none/presentation are always allowed as they remove from accessibility tree *) ··· 565 578 | Some _ -> 566 579 if first_role <> "tab" then 567 580 Message_collector.add_typed collector Error_code.Li_bad_role_in_tablist 568 - | None -> ()) 581 + | None -> 582 + (* Check if in list context (ul/ol/menu without explicit role, or role=list) *) 583 + if is_in_list_context state then 584 + if first_role <> "listitem" then 585 + Message_collector.add_typed collector Error_code.Li_bad_role_in_list) 569 586 end 570 587 end; 571 588
+26 -1
lib/html5_checker/specialized/normalization_checker.ml
··· 17 17 let normalized = normalize_nfc text in 18 18 text = normalized 19 19 20 + (** Check if a character is ASCII punctuation *) 21 + let is_ascii_punct c = 22 + let code = Char.code c in 23 + (code >= 0x21 && code <= 0x2F) || (* ! to / *) 24 + (code >= 0x3A && code <= 0x40) || (* : to @ including ? *) 25 + (code >= 0x5B && code <= 0x60) || (* [ to ` *) 26 + (code >= 0x7B && code <= 0x7E) (* { to ~ *) 27 + 28 + (** Strip trailing ASCII punctuation but keep trailing space if present before punct *) 29 + let strip_trailing_punct s = 30 + let len = String.length s in 31 + if len = 0 then s 32 + else 33 + (* Find the last non-ASCII-punct character *) 34 + let rec find_end i = 35 + if i < 0 then 0 36 + else if not (is_ascii_punct s.[i]) then i + 1 37 + else find_end (i - 1) 38 + in 39 + let end_pos = find_end (len - 1) in 40 + if end_pos = len then s 41 + else String.sub s 0 end_pos 42 + 20 43 let start_element _state ~name:_ ~namespace:_ ~attrs:_ _collector = () 21 44 22 45 let end_element _state ~name:_ ~namespace:_ _collector = () ··· 27 50 if String.length text_trimmed = 0 then () 28 51 else if not (is_nfc text_trimmed) then begin 29 52 let normalized = normalize_nfc text_trimmed in 53 + (* Strip trailing ASCII punctuation from replacement to match Nu validator *) 54 + let replacement = strip_trailing_punct normalized in 30 55 Message_collector.add_typed collector 31 - (Error_code.Not_nfc { replacement = normalized }) 56 + (Error_code.Not_nfc { replacement }) 32 57 end 33 58 34 59 let end_document _state _collector = ()
+36 -6
lib/html5_checker/specialized/srcset_sizes_checker.ml
··· 47 47 (** Split string on commas while respecting parentheses *) 48 48 let split_on_comma_respecting_parens s = split_respecting_parens ~sep:',' s 49 49 50 + (** Split on commas respecting BALANCED parentheses only (for srcset). 51 + If parens are unbalanced overall, just split on all commas. *) 52 + let split_on_comma_balanced_parens s = 53 + (* First, check if parens are balanced overall *) 54 + let opens = ref 0 and closes = ref 0 in 55 + String.iter (fun c -> if c = '(' then incr opens else if c = ')' then incr closes) s; 56 + if !opens <> !closes then 57 + (* Unbalanced parens - just split on all commas *) 58 + String.split_on_char ',' s 59 + else 60 + (* Balanced parens - respect them during split *) 61 + split_on_comma_respecting_parens s 62 + 50 63 (** Split string on spaces while respecting parentheses, filtering empty segments *) 51 64 let split_on_space_respecting_parens s = 52 65 split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "") ··· 730 743 | _ -> 731 744 (* Unknown descriptor - find context in srcset_value *) 732 745 let trimmed_desc = String.trim desc in 733 - (* Try to find the context: find where this descriptor appears in srcset_value *) 746 + (* Nu validator adds extra ')' after the last ')' if descriptor contains any '(' *) 747 + let found_desc = 748 + if String.contains trimmed_desc '(' then 749 + (* Find position of last ')' and insert extra ')' after it *) 750 + try 751 + let last_close = String.rindex trimmed_desc ')' in 752 + let before = String.sub trimmed_desc 0 (last_close + 1) in 753 + let after = String.sub trimmed_desc (last_close + 1) (String.length trimmed_desc - last_close - 1) in 754 + before ^ ")" ^ after 755 + with Not_found -> trimmed_desc ^ ")" 756 + else trimmed_desc 757 + in 758 + (* Try to find the context: show trailing portion ending with descriptor and comma *) 734 759 let context = 735 760 try 736 761 let pos = Str.search_forward (Str.regexp_string trimmed_desc) srcset_value 0 in 737 - (* Get the context up to and including the descriptor and the comma after *) 762 + (* Get the context ending with the descriptor and the comma after *) 738 763 let end_pos = min (pos + String.length trimmed_desc + 1) (String.length srcset_value) in 739 - let start_pos = max 0 (pos - 2) in 740 - String.trim (String.sub srcset_value start_pos (end_pos - start_pos)) 764 + (* Show trailing portion with ellipsis if needed *) 765 + let max_context = 15 in 766 + if end_pos > max_context then 767 + "\xe2\x80\xa6" ^ String.sub srcset_value (end_pos - max_context) max_context 768 + else 769 + String.trim (String.sub srcset_value 0 end_pos) 741 770 with Not_found -> srcset_value 742 771 in 743 772 Message_collector.add_typed collector 744 - (Error_code.Bad_attr_value_generic { message = Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected number followed by \xe2\x80\x9cw\xe2\x80\x9d or \xe2\x80\x9cx\xe2\x80\x9d but found \xe2\x80\x9c%s\xe2\x80\x9d at \xe2\x80\x9c%s\xe2\x80\x9d." srcset_value element_name trimmed_desc context }); 773 + (Error_code.Bad_attr_value_generic { message = Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected number followed by \xe2\x80\x9cw\xe2\x80\x9d or \xe2\x80\x9cx\xe2\x80\x9d but found \xe2\x80\x9c%s\xe2\x80\x9d at \xe2\x80\x9c%s\xe2\x80\x9d." srcset_value element_name found_desc context }); 745 774 false 746 775 end 747 776 ··· 766 795 767 796 (** Parse and validate srcset attribute value *) 768 797 let validate_srcset value element_name has_sizes collector = 769 - let entries = split_on_comma_respecting_parens value in 798 + (* Srcset entries are split on commas - only balanced parentheses prevent split *) 799 + let entries = split_on_comma_balanced_parens value in 770 800 let has_w_descriptor = ref false in 771 801 let has_x_descriptor = ref false in 772 802 let no_descriptor_url = ref None in (* Track URL of first entry without width descriptor *)
+1 -1
lib/html5_checker/specialized/svg_checker.ml
··· 297 297 if value <> "http://www.w3.org/1999/xlink" then 298 298 Message_collector.add_typed collector 299 299 (Error_code.Bad_attr_value_generic { message = Printf.sprintf 300 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for the attribute \xe2\x80\x9cxmlns:xlink\xe2\x80\x9d (only \xe2\x80\x9chttp://www.w3.org/1999/xlink\xe2\x80\x9d permitted here)." 300 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for the attribute \xe2\x80\x9cxmlns:link\xe2\x80\x9d (only \xe2\x80\x9chttp://www.w3.org/1999/xlink\xe2\x80\x9d permitted here)." 301 301 value }) 302 302 | _ when String.starts_with ~prefix:"xmlns:" attr && attr <> "xmlns:xlink" -> 303 303 (* Other xmlns declarations are not allowed in HTML-embedded SVG *)