OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 4.1 kB view raw
1(** DOM tree traversal for HTML5 conformance checking. *) 2 3(** Convert DOM location to Message location. *) 4let dom_location_to_message_location (loc : Html5rw.Dom.location) : Message.location = 5 Message.make_location 6 ~line:loc.line 7 ~column:loc.column 8 ?end_line:loc.end_line 9 ?end_column:loc.end_column 10 () 11 12(** Get Message.location from a DOM node. *) 13let node_location (node : Html5rw.Dom.node) : Message.location option = 14 Option.map dom_location_to_message_location node.location 15 16(** Package a checker with its state for traversal. *) 17type checker_state = { 18 start_element : element:Element.t -> Message_collector.t -> unit; 19 end_element : tag:Tag.element_tag -> Message_collector.t -> unit; 20 characters : string -> Message_collector.t -> unit; 21 end_document : Message_collector.t -> unit; 22} 23 24(** Create a checker state package from a first-class module. *) 25let make_checker_state (module C : Checker.S) = 26 let state = C.create () in 27 { 28 start_element = (fun ~element collector -> 29 C.start_element state ~element collector); 30 end_element = (fun ~tag collector -> 31 C.end_element state ~tag collector); 32 characters = (fun text collector -> 33 C.characters state text collector); 34 end_document = (fun collector -> 35 C.end_document state collector); 36 } 37 38(** Walk a DOM node with a single checker state. *) 39let rec walk_node_single cs collector node = 40 let open Html5rw.Dom in 41 (* Set current location for messages *) 42 Message_collector.set_current_location collector (node_location node); 43 match node.name with 44 | "#text" -> 45 (* Text node: emit characters event *) 46 cs.characters node.data collector 47 | "#comment" -> 48 (* Comment node: skip - comment content is not text content *) 49 () 50 | "#document" | "#document-fragment" -> 51 (* Document/fragment nodes: just traverse children *) 52 List.iter (walk_node_single cs collector) node.children 53 | "!doctype" -> 54 (* Doctype node: skip (no validation events for doctype) *) 55 () 56 | _ -> 57 (* Element node: create typed element, emit start, traverse children, emit end *) 58 let element = Element.create ~name:node.name ~namespace:node.namespace ~attrs:node.attrs in 59 cs.start_element ~element collector; 60 List.iter (walk_node_single cs collector) node.children; 61 cs.end_element ~tag:element.tag collector 62 63let walk checker collector node = 64 let cs = make_checker_state checker in 65 walk_node_single cs collector node; 66 cs.end_document collector 67 68(** Walk a DOM node with multiple checker states. *) 69let rec walk_node_all css collector node = 70 let open Html5rw.Dom in 71 (* Set current location for messages *) 72 Message_collector.set_current_location collector (node_location node); 73 match node.name with 74 | "#text" -> 75 (* Text node: emit characters event to all checkers *) 76 List.iter (fun cs -> cs.characters node.data collector) css 77 | "#comment" -> 78 (* Comment node: skip - comment content is not text content *) 79 () 80 | "#document" | "#document-fragment" -> 81 (* Document/fragment nodes: just traverse children *) 82 List.iter (walk_node_all css collector) node.children 83 | "!doctype" -> 84 (* Doctype node: skip *) 85 () 86 | _ -> 87 (* Element node: create typed element, emit start to all checkers, traverse children, emit end to all *) 88 let element = Element.create ~name:node.name ~namespace:node.namespace ~attrs:node.attrs in 89 List.iter (fun cs -> cs.start_element ~element collector) css; 90 List.iter (walk_node_all css collector) node.children; 91 List.iter (fun cs -> cs.end_element ~tag:element.tag collector) css 92 93let walk_all checkers collector node = 94 (* Create checker state packages *) 95 let css = List.map make_checker_state checkers in 96 (* Traverse with all checkers *) 97 walk_node_all css collector node; 98 (* Call end_document on all checkers *) 99 List.iter (fun cs -> cs.end_document collector) css 100 101let walk_registry registry collector node = 102 let checkers = Checker_registry.all registry in 103 walk_all checkers collector node