OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+1
lib/html5_checker/checker_registry.ml
··· 42 42 Hashtbl.replace reg "mime-type" Mime_type_checker.checker; 43 43 Hashtbl.replace reg "normalization" Normalization_checker.checker; 44 44 Hashtbl.replace reg "svg" Svg_checker.checker; 45 + Hashtbl.replace reg "xhtml-content" Xhtml_content_checker.checker; 45 46 (* Hashtbl.replace reg "heading" Heading_checker.checker; *) 46 47 (* Hashtbl.replace reg "content" Content_checker.checker; *) 47 48 reg
+1 -1
lib/html5_checker/dune
··· 3 3 (library 4 4 (name html5_checker) 5 5 (public_name html5rw.checker) 6 - (libraries html5rw jsont jsont.bytesrw astring str uunf uutf) 6 + (libraries html5rw jsont jsont.bytesrw astring str uunf uutf xmlm) 7 7 )
+31 -10
lib/html5_checker/html5_checker.ml
··· 19 19 } 20 20 21 21 let check ?(collect_parse_errors = true) ?system_id reader = 22 - let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in 23 22 let collector = Message_collector.create () in 24 23 25 - (* Add parse errors if collected *) 26 - if collect_parse_errors then begin 27 - let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in 28 - List.iter (Message_collector.add collector) parse_errors 29 - end; 24 + (* Check if this is an XHTML file - use XML parser if so *) 25 + if Xhtml_parser.is_xhtml_file system_id then begin 26 + (* Read all content for XHTML parsing *) 27 + let content = Bytesrw.Bytes.Reader.to_string reader in 28 + 29 + match Xhtml_parser.parse_xhtml content with 30 + | Ok root -> 31 + (* Run all registered checkers via DOM traversal *) 32 + let registry = Checker_registry.default () in 33 + Dom_walker.walk_registry registry collector root; 34 + let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in 35 + { doc = dummy_doc; msgs = Message_collector.messages collector; system_id } 36 + | Error msg -> 37 + Message_collector.add_error collector ~message:msg ~code:"xml-parse-error" (); 38 + let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in 39 + { doc = dummy_doc; msgs = Message_collector.messages collector; system_id } 40 + end 41 + else begin 42 + (* Standard HTML5 parsing *) 43 + let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in 30 44 31 - (* Run all registered checkers via DOM traversal *) 32 - let registry = Checker_registry.default () in 33 - Dom_walker.walk_registry registry collector (Html5rw.root doc); 45 + (* Add parse errors if collected *) 46 + if collect_parse_errors then begin 47 + let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in 48 + List.iter (Message_collector.add collector) parse_errors 49 + end; 34 50 35 - { doc; msgs = Message_collector.messages collector; system_id } 51 + (* Run all registered checkers via DOM traversal *) 52 + let registry = Checker_registry.default () in 53 + Dom_walker.walk_registry registry collector (Html5rw.root doc); 54 + 55 + { doc; msgs = Message_collector.messages collector; system_id } 56 + end 36 57 37 58 let check_dom ?(collect_parse_errors = true) ?system_id doc = 38 59 let collector = Message_collector.create () in
+68 -7
lib/html5_checker/semantic/required_attr_checker.ml
··· 3 3 type state = { 4 4 mutable _in_figure : bool; 5 5 (** Track if we're inside a <figure> element (alt is more critical there) *) 6 + mutable in_a_with_href : bool; 7 + (** Track if we're inside an <a> element with href attribute *) 6 8 } 7 9 8 - let create () = { _in_figure = false } 10 + let create () = { _in_figure = false; in_a_with_href = false } 9 11 10 - let reset state = state._in_figure <- false 12 + let reset state = 13 + state._in_figure <- false; 14 + state.in_a_with_href <- false 11 15 12 16 (** Check if an attribute list contains a specific attribute. *) 13 17 let has_attr name attrs = ··· 20 24 if String.equal attr_name name then Some value else None) 21 25 attrs 22 26 23 - let check_img_element attrs collector = 27 + let check_img_element state attrs collector = 24 28 (* Check for required src OR srcset attribute *) 25 29 if not (has_attr "src" attrs) && not (has_attr "srcset" attrs) then 26 30 Message_collector.add_error collector ··· 31 35 if not (has_attr "alt" attrs) then 32 36 Message_collector.add_error collector 33 37 ~message:"img element requires alt attribute for accessibility" 34 - ~code:"missing-required-attribute" ~element:"img" ~attribute:"alt" () 38 + ~code:"missing-required-attribute" ~element:"img" ~attribute:"alt" (); 39 + 40 + (* Check ismap requires 'a' ancestor with href *) 41 + if has_attr "ismap" attrs && not state.in_a_with_href then 42 + Message_collector.add_error collector 43 + ~message:"The \xe2\x80\x9cimg\xe2\x80\x9d element with the \xe2\x80\x9cismap\xe2\x80\x9d attribute set must have an \xe2\x80\x9ca\xe2\x80\x9d ancestor with the \xe2\x80\x9chref\xe2\x80\x9d attribute." 44 + ~code:"missing-required-ancestor" ~element:"img" ~attribute:"ismap" () 35 45 36 46 let check_area_element attrs collector = 37 47 (* area with href requires alt *) ··· 143 153 ~code:"bad-attribute-value" ~element:"div" ~attribute:"popover" () 144 154 | None -> () 145 155 156 + let check_meter_element attrs collector = 157 + (* meter requires value attribute *) 158 + if not (has_attr "value" attrs) then 159 + Message_collector.add_error collector 160 + ~message:"Element \xe2\x80\x9cmeter\xe2\x80\x9d is missing required attribute \xe2\x80\x9cvalue\xe2\x80\x9d." 161 + ~code:"missing-required-attribute" ~element:"meter" ~attribute:"value" () 162 + else begin 163 + (* Validate min <= value constraint *) 164 + match get_attr "value" attrs, get_attr "min" attrs with 165 + | Some value_str, Some min_str -> 166 + (try 167 + let value = float_of_string value_str in 168 + let min_val = float_of_string min_str in 169 + if min_val > value then 170 + Message_collector.add_error collector 171 + ~message:"The value of the \xe2\x80\x9cmin\xe2\x80\x9d attribute must be less than or equal to the value of the \xe2\x80\x9cvalue\xe2\x80\x9d attribute." 172 + ~code:"bad-attribute-value" ~element:"meter" ~attribute:"min" () 173 + with _ -> ()) 174 + | _ -> () 175 + end 176 + 177 + let check_progress_element attrs collector = 178 + (* Validate progress value constraints *) 179 + match get_attr "value" attrs with 180 + | None -> () (* value is optional *) 181 + | Some value_str -> 182 + (try 183 + let value = float_of_string value_str in 184 + let max_val = match get_attr "max" attrs with 185 + | None -> 1.0 (* default max is 1 *) 186 + | Some max_str -> (try float_of_string max_str with _ -> 1.0) 187 + in 188 + if value > max_val then 189 + (* Check which message to use based on whether max is present *) 190 + if has_attr "max" attrs then 191 + Message_collector.add_error collector 192 + ~message:"The value of the \xe2\x80\x9cvalue\xe2\x80\x9d attribute must be less than or equal to the value of the \xe2\x80\x9cmax\xe2\x80\x9d attribute." 193 + ~code:"bad-attribute-value" ~element:"progress" ~attribute:"value" () 194 + else 195 + Message_collector.add_error collector 196 + ~message:"The value of the \xe2\x80\x9cvalue\xe2\x80\x9d attribute must be less than or equal to one when the \xe2\x80\x9cmax\xe2\x80\x9d attribute is absent." 197 + ~code:"bad-attribute-value" ~element:"progress" ~attribute:"value" () 198 + with _ -> ()) 199 + 146 200 let start_element state ~name ~namespace:_ ~attrs collector = 147 201 match name with 148 - | "img" -> check_img_element attrs collector 202 + | "img" -> check_img_element state attrs collector 149 203 | "area" -> check_area_element attrs collector 150 204 | "input" -> check_input_element attrs collector 151 205 | "script" -> check_script_element attrs collector 152 206 | "meta" -> check_meta_element attrs collector 153 207 | "link" -> check_link_element attrs collector 154 - | "a" -> check_a_element attrs collector 208 + | "a" -> 209 + check_a_element attrs collector; 210 + if has_attr "href" attrs then state.in_a_with_href <- true 155 211 | "map" -> check_map_element attrs collector 156 212 | "object" -> check_object_element attrs collector 213 + | "meter" -> check_meter_element attrs collector 214 + | "progress" -> check_progress_element attrs collector 157 215 | "figure" -> state._in_figure <- true 158 216 | _ -> 159 217 (* Check popover attribute on any element *) 160 218 if has_attr "popover" attrs then check_popover_element attrs collector 161 219 162 220 let end_element state ~name ~namespace:_ _collector = 163 - match name with "figure" -> state._in_figure <- false | _ -> () 221 + match name with 222 + | "figure" -> state._in_figure <- false 223 + | "a" -> state.in_a_with_href <- false 224 + | _ -> () 164 225 165 226 let characters _state _text _collector = () 166 227
+103
lib/html5_checker/specialized/xhtml_content_checker.ml
··· 1 + (** XHTML content model checker. 2 + 3 + Validates specific content model rules that the Nu validator checks, 4 + particularly for elements that don't allow text content or specific children. *) 5 + 6 + type state = { 7 + mutable element_stack : string list; 8 + } 9 + 10 + let create () = { element_stack = [] } 11 + 12 + let reset state = state.element_stack <- [] 13 + 14 + (* Elements that don't allow direct text content (only specific child elements) *) 15 + let no_text_elements = [ 16 + "menu"; (* Only li elements *) 17 + "iframe"; (* In XHTML mode, no content allowed *) 18 + "figure"; (* Only figcaption and flow content, not bare text *) 19 + ] 20 + 21 + 22 + (* Check if an element is allowed as child of parent *) 23 + let is_child_allowed ~parent ~child = 24 + match parent with 25 + | "menu" -> 26 + (* menu only allows li, script, template *) 27 + List.mem child ["li"; "script"; "template"] 28 + | _ -> true 29 + 30 + (* Check if text is allowed in element *) 31 + let is_text_allowed element = 32 + not (List.mem element no_text_elements) 33 + 34 + (* Check if data-* attribute has uppercase characters *) 35 + let check_data_attr_case attrs collector = 36 + List.iter (fun (attr_name, _) -> 37 + if String.length attr_name > 5 && 38 + String.sub attr_name 0 5 = "data-" then 39 + let suffix = String.sub attr_name 5 (String.length attr_name - 5) in 40 + if String.exists (fun c -> c >= 'A' && c <= 'Z') suffix then 41 + Message_collector.add_error collector 42 + ~message:"\xe2\x80\x9cdata-*\xe2\x80\x9d attributes must not have characters from the range \xe2\x80\x9cA\xe2\x80\x9d\xe2\x80\xa6\xe2\x80\x9cZ\xe2\x80\x9d in the name." 43 + ~attribute:attr_name 44 + () 45 + ) attrs 46 + 47 + let start_element state ~name ~namespace ~attrs collector = 48 + ignore namespace; 49 + let name_lower = String.lowercase_ascii name in 50 + 51 + (* Check data-* attributes for uppercase *) 52 + check_data_attr_case attrs collector; 53 + 54 + (* Check if this element is allowed as child of parent *) 55 + (match state.element_stack with 56 + | parent :: _ -> 57 + let parent_lower = String.lowercase_ascii parent in 58 + if not (is_child_allowed ~parent:parent_lower ~child:name_lower) then 59 + Message_collector.add_error collector 60 + ~message:(Printf.sprintf 61 + "Element \xe2\x80\x9c%s\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)" 62 + name_lower parent_lower) 63 + ~element:name_lower 64 + () 65 + | [] -> ()); 66 + 67 + (* Push onto stack *) 68 + state.element_stack <- name :: state.element_stack 69 + 70 + let end_element state ~name:_ ~namespace:_ _collector = 71 + (* Pop from stack *) 72 + match state.element_stack with 73 + | _ :: rest -> state.element_stack <- rest 74 + | [] -> () 75 + 76 + let characters state text collector = 77 + (* Check if text is allowed in current element *) 78 + match state.element_stack with 79 + | [] -> () (* Root level - ignore *) 80 + | parent :: _ -> 81 + let parent_lower = String.lowercase_ascii parent in 82 + (* Only report non-whitespace text *) 83 + let trimmed = String.trim text in 84 + if trimmed <> "" && not (is_text_allowed parent_lower) then 85 + Message_collector.add_error collector 86 + ~message:(Printf.sprintf 87 + "Text not allowed in element \xe2\x80\x9c%s\xe2\x80\x9d in this context." 88 + parent_lower) 89 + ~element:parent_lower 90 + () 91 + 92 + let end_document _state _collector = () 93 + 94 + let checker = 95 + (module struct 96 + type nonrec state = state 97 + let create = create 98 + let reset = reset 99 + let start_element = start_element 100 + let end_element = end_element 101 + let characters = characters 102 + let end_document = end_document 103 + end : Checker.S)
+5
lib/html5_checker/specialized/xhtml_content_checker.mli
··· 1 + (** XHTML content model checker. 2 + 3 + Validates specific content model rules for XHTML. *) 4 + 5 + val checker : Checker.t
+85
lib/html5_checker/xhtml_parser.ml
··· 1 + (** XHTML parser using xmlm for proper XML parsing. 2 + 3 + This module provides XML parsing for XHTML files, which the HTML5 parser 4 + cannot handle correctly (especially self-closing tags on non-void elements). *) 5 + 6 + (** Parse XHTML content using xmlm and return a DOM tree. *) 7 + let parse_xhtml content = 8 + let input = Xmlm.make_input (`String (0, content)) in 9 + 10 + (* Stack of nodes during parsing *) 11 + let stack = ref [] in 12 + let root = Html5rw.Dom.create_document () in 13 + stack := [root]; 14 + 15 + (* Helper to get namespace shorthand *) 16 + let ns_shorthand ns = 17 + if ns = "http://www.w3.org/2000/svg" then Some "svg" 18 + else if ns = "http://www.w3.org/1998/Math/MathML" then Some "mathml" 19 + else if ns = "http://www.w3.org/1999/xhtml" then None (* HTML namespace *) 20 + else if ns = "" then None (* No namespace = HTML *) 21 + else Some ns (* Keep other namespaces as-is *) 22 + in 23 + 24 + (* Process xmlm signals *) 25 + let rec process () = 26 + if Xmlm.eoi input then () 27 + else begin 28 + match Xmlm.input input with 29 + | `Dtd _ -> 30 + (* Skip DTD for now *) 31 + process () 32 + | `El_start ((ns, local), attrs) -> 33 + (* Create element node *) 34 + let namespace = ns_shorthand ns in 35 + let attr_list = List.map (fun ((_, aname), aval) -> (aname, aval)) attrs in 36 + let node = Html5rw.Dom.create_element local ~namespace ~attrs:attr_list () in 37 + (* Append to current parent *) 38 + (match !stack with 39 + | parent :: _ -> Html5rw.Dom.append_child parent node 40 + | [] -> ()); 41 + (* Push onto stack *) 42 + stack := node :: !stack; 43 + process () 44 + | `El_end -> 45 + (* Pop from stack *) 46 + (match !stack with 47 + | _ :: rest -> stack := rest 48 + | [] -> ()); 49 + process () 50 + | `Data text -> 51 + (* Create text node and append to current parent *) 52 + let trimmed = String.trim text in 53 + if trimmed <> "" || String.length text > 0 then begin 54 + let text_node = Html5rw.Dom.create_text text in 55 + (match !stack with 56 + | parent :: _ -> Html5rw.Dom.append_child parent text_node 57 + | [] -> ()) 58 + end; 59 + process () 60 + end 61 + in 62 + 63 + try 64 + process (); 65 + Ok root 66 + with 67 + | Xmlm.Error ((line, col), err) -> 68 + Error (Printf.sprintf "XML parse error at %d:%d: %s" line col (Xmlm.error_message err)) 69 + 70 + (** Check if a system_id indicates an XHTML file. *) 71 + let is_xhtml_file system_id = 72 + match system_id with 73 + | Some path -> 74 + String.length path > 6 && 75 + String.sub path (String.length path - 6) 6 = ".xhtml" 76 + | None -> false 77 + 78 + (** Wrap DOM in an Html5rw.t-compatible structure for the checker. *) 79 + type xhtml_doc = { 80 + root : Html5rw.Dom.node; 81 + errors : Html5rw.Error.t list; 82 + } 83 + 84 + let xhtml_root doc = doc.root 85 + let xhtml_errors _doc = [] (* XML parser handles errors differently *)
+2 -2
test/debug_check.ml
··· 1 1 let () = 2 - let test_file = "validator/tests/xhtml/elements/progress/002-isvalid.xhtml" in 2 + let test_file = "validator/tests/xhtml/elements/menu/menu-containing-text-novalid.xhtml" in 3 3 let ic = open_in test_file in 4 4 let html = really_input_string ic (in_channel_length ic) in 5 5 close_in ic; ··· 32 32 print_endline "=== Errors ==="; 33 33 List.iter (fun e -> print_endline e.Html5_checker.Message.message) errors; 34 34 print_endline "\n=== Expected ==="; 35 - print_endline "Element \xe2\x80\x9crect\xe2\x80\x9d is missing required attribute \xe2\x80\x9cheight\xe2\x80\x9d." 35 + print_endline "Text not allowed in element \xe2\x80\x9cmenu\xe2\x80\x9d in this context."