OCaml HTML5 parser/serialiser based on Python's JustHTML

more

Changed files
+82 -10
lib
+19 -2
lib/html5_checker/specialized/aria_checker.ml
··· 425 | parent :: _ -> Some parent.element_name 426 | [] -> None 427 428 (** Render a list of roles as a human-readable string. *) 429 let render_role_set roles = 430 match roles with ··· 548 end 549 end; 550 551 - (* Check li role restrictions in menu/menubar/tablist contexts *) 552 if name_lower = "li" && explicit_roles <> [] then begin 553 let first_role = List.hd explicit_roles in 554 (* none/presentation are always allowed as they remove from accessibility tree *) ··· 565 | Some _ -> 566 if first_role <> "tab" then 567 Message_collector.add_typed collector Error_code.Li_bad_role_in_tablist 568 - | None -> ()) 569 end 570 end; 571
··· 425 | parent :: _ -> Some parent.element_name 426 | [] -> None 427 428 + (** Check if an li element is in a "list context": 429 + - Inside a ul/ol/menu element with no explicit role, OR 430 + - Inside any element with role=list *) 431 + let is_in_list_context state = 432 + List.exists (fun ancestor -> 433 + (* Check for role=list on any ancestor *) 434 + if List.mem "list" ancestor.explicit_roles then true 435 + (* Check for ul/ol/menu with no explicit role *) 436 + else match ancestor.element_name with 437 + | "ul" | "ol" | "menu" -> ancestor.explicit_roles = [] 438 + | _ -> false 439 + ) state.stack 440 + 441 (** Render a list of roles as a human-readable string. *) 442 let render_role_set roles = 443 match roles with ··· 561 end 562 end; 563 564 + (* Check li role restrictions in menu/menubar/tablist/list contexts *) 565 if name_lower = "li" && explicit_roles <> [] then begin 566 let first_role = List.hd explicit_roles in 567 (* none/presentation are always allowed as they remove from accessibility tree *) ··· 578 | Some _ -> 579 if first_role <> "tab" then 580 Message_collector.add_typed collector Error_code.Li_bad_role_in_tablist 581 + | None -> 582 + (* Check if in list context (ul/ol/menu without explicit role, or role=list) *) 583 + if is_in_list_context state then 584 + if first_role <> "listitem" then 585 + Message_collector.add_typed collector Error_code.Li_bad_role_in_list) 586 end 587 end; 588
+26 -1
lib/html5_checker/specialized/normalization_checker.ml
··· 17 let normalized = normalize_nfc text in 18 text = normalized 19 20 let start_element _state ~name:_ ~namespace:_ ~attrs:_ _collector = () 21 22 let end_element _state ~name:_ ~namespace:_ _collector = () ··· 27 if String.length text_trimmed = 0 then () 28 else if not (is_nfc text_trimmed) then begin 29 let normalized = normalize_nfc text_trimmed in 30 Message_collector.add_typed collector 31 - (Error_code.Not_nfc { replacement = normalized }) 32 end 33 34 let end_document _state _collector = ()
··· 17 let normalized = normalize_nfc text in 18 text = normalized 19 20 + (** Check if a character is ASCII punctuation *) 21 + let is_ascii_punct c = 22 + let code = Char.code c in 23 + (code >= 0x21 && code <= 0x2F) || (* ! to / *) 24 + (code >= 0x3A && code <= 0x40) || (* : to @ including ? *) 25 + (code >= 0x5B && code <= 0x60) || (* [ to ` *) 26 + (code >= 0x7B && code <= 0x7E) (* { to ~ *) 27 + 28 + (** Strip trailing ASCII punctuation but keep trailing space if present before punct *) 29 + let strip_trailing_punct s = 30 + let len = String.length s in 31 + if len = 0 then s 32 + else 33 + (* Find the last non-ASCII-punct character *) 34 + let rec find_end i = 35 + if i < 0 then 0 36 + else if not (is_ascii_punct s.[i]) then i + 1 37 + else find_end (i - 1) 38 + in 39 + let end_pos = find_end (len - 1) in 40 + if end_pos = len then s 41 + else String.sub s 0 end_pos 42 + 43 let start_element _state ~name:_ ~namespace:_ ~attrs:_ _collector = () 44 45 let end_element _state ~name:_ ~namespace:_ _collector = () ··· 50 if String.length text_trimmed = 0 then () 51 else if not (is_nfc text_trimmed) then begin 52 let normalized = normalize_nfc text_trimmed in 53 + (* Strip trailing ASCII punctuation from replacement to match Nu validator *) 54 + let replacement = strip_trailing_punct normalized in 55 Message_collector.add_typed collector 56 + (Error_code.Not_nfc { replacement }) 57 end 58 59 let end_document _state _collector = ()
+36 -6
lib/html5_checker/specialized/srcset_sizes_checker.ml
··· 47 (** Split string on commas while respecting parentheses *) 48 let split_on_comma_respecting_parens s = split_respecting_parens ~sep:',' s 49 50 (** Split string on spaces while respecting parentheses, filtering empty segments *) 51 let split_on_space_respecting_parens s = 52 split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "") ··· 730 | _ -> 731 (* Unknown descriptor - find context in srcset_value *) 732 let trimmed_desc = String.trim desc in 733 - (* Try to find the context: find where this descriptor appears in srcset_value *) 734 let context = 735 try 736 let pos = Str.search_forward (Str.regexp_string trimmed_desc) srcset_value 0 in 737 - (* Get the context up to and including the descriptor and the comma after *) 738 let end_pos = min (pos + String.length trimmed_desc + 1) (String.length srcset_value) in 739 - let start_pos = max 0 (pos - 2) in 740 - String.trim (String.sub srcset_value start_pos (end_pos - start_pos)) 741 with Not_found -> srcset_value 742 in 743 Message_collector.add_typed collector 744 - (Error_code.Bad_attr_value_generic { message = Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected number followed by \xe2\x80\x9cw\xe2\x80\x9d or \xe2\x80\x9cx\xe2\x80\x9d but found \xe2\x80\x9c%s\xe2\x80\x9d at \xe2\x80\x9c%s\xe2\x80\x9d." srcset_value element_name trimmed_desc context }); 745 false 746 end 747 ··· 766 767 (** Parse and validate srcset attribute value *) 768 let validate_srcset value element_name has_sizes collector = 769 - let entries = split_on_comma_respecting_parens value in 770 let has_w_descriptor = ref false in 771 let has_x_descriptor = ref false in 772 let no_descriptor_url = ref None in (* Track URL of first entry without width descriptor *)
··· 47 (** Split string on commas while respecting parentheses *) 48 let split_on_comma_respecting_parens s = split_respecting_parens ~sep:',' s 49 50 + (** Split on commas respecting BALANCED parentheses only (for srcset). 51 + If parens are unbalanced overall, just split on all commas. *) 52 + let split_on_comma_balanced_parens s = 53 + (* First, check if parens are balanced overall *) 54 + let opens = ref 0 and closes = ref 0 in 55 + String.iter (fun c -> if c = '(' then incr opens else if c = ')' then incr closes) s; 56 + if !opens <> !closes then 57 + (* Unbalanced parens - just split on all commas *) 58 + String.split_on_char ',' s 59 + else 60 + (* Balanced parens - respect them during split *) 61 + split_on_comma_respecting_parens s 62 + 63 (** Split string on spaces while respecting parentheses, filtering empty segments *) 64 let split_on_space_respecting_parens s = 65 split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "") ··· 743 | _ -> 744 (* Unknown descriptor - find context in srcset_value *) 745 let trimmed_desc = String.trim desc in 746 + (* Nu validator adds extra ')' after the last ')' if descriptor contains any '(' *) 747 + let found_desc = 748 + if String.contains trimmed_desc '(' then 749 + (* Find position of last ')' and insert extra ')' after it *) 750 + try 751 + let last_close = String.rindex trimmed_desc ')' in 752 + let before = String.sub trimmed_desc 0 (last_close + 1) in 753 + let after = String.sub trimmed_desc (last_close + 1) (String.length trimmed_desc - last_close - 1) in 754 + before ^ ")" ^ after 755 + with Not_found -> trimmed_desc ^ ")" 756 + else trimmed_desc 757 + in 758 + (* Try to find the context: show trailing portion ending with descriptor and comma *) 759 let context = 760 try 761 let pos = Str.search_forward (Str.regexp_string trimmed_desc) srcset_value 0 in 762 + (* Get the context ending with the descriptor and the comma after *) 763 let end_pos = min (pos + String.length trimmed_desc + 1) (String.length srcset_value) in 764 + (* Show trailing portion with ellipsis if needed *) 765 + let max_context = 15 in 766 + if end_pos > max_context then 767 + "\xe2\x80\xa6" ^ String.sub srcset_value (end_pos - max_context) max_context 768 + else 769 + String.trim (String.sub srcset_value 0 end_pos) 770 with Not_found -> srcset_value 771 in 772 Message_collector.add_typed collector 773 + (Error_code.Bad_attr_value_generic { message = Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected number followed by \xe2\x80\x9cw\xe2\x80\x9d or \xe2\x80\x9cx\xe2\x80\x9d but found \xe2\x80\x9c%s\xe2\x80\x9d at \xe2\x80\x9c%s\xe2\x80\x9d." srcset_value element_name found_desc context }); 774 false 775 end 776 ··· 795 796 (** Parse and validate srcset attribute value *) 797 let validate_srcset value element_name has_sizes collector = 798 + (* Srcset entries are split on commas - only balanced parentheses prevent split *) 799 + let entries = split_on_comma_balanced_parens value in 800 let has_w_descriptor = ref false in 801 let has_x_descriptor = ref false in 802 let no_descriptor_url = ref None in (* Track URL of first entry without width descriptor *)
+1 -1
lib/html5_checker/specialized/svg_checker.ml
··· 297 if value <> "http://www.w3.org/1999/xlink" then 298 Message_collector.add_typed collector 299 (Error_code.Bad_attr_value_generic { message = Printf.sprintf 300 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for the attribute \xe2\x80\x9cxmlns:xlink\xe2\x80\x9d (only \xe2\x80\x9chttp://www.w3.org/1999/xlink\xe2\x80\x9d permitted here)." 301 value }) 302 | _ when String.starts_with ~prefix:"xmlns:" attr && attr <> "xmlns:xlink" -> 303 (* Other xmlns declarations are not allowed in HTML-embedded SVG *)
··· 297 if value <> "http://www.w3.org/1999/xlink" then 298 Message_collector.add_typed collector 299 (Error_code.Bad_attr_value_generic { message = Printf.sprintf 300 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for the attribute \xe2\x80\x9cxmlns:link\xe2\x80\x9d (only \xe2\x80\x9chttp://www.w3.org/1999/xlink\xe2\x80\x9d permitted here)." 301 value }) 302 | _ when String.starts_with ~prefix:"xmlns:" attr && attr <> "xmlns:xlink" -> 303 (* Other xmlns declarations are not allowed in HTML-embedded SVG *)