OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** DOM tree traversal for HTML5 conformance checking. *) 2 3(** Package a checker with its state for traversal. *) 4type checker_state = { 5 start_element : 6 name:string -> 7 namespace:string option -> 8 attrs:(string * string) list -> 9 Message_collector.t -> 10 unit; 11 end_element : 12 name:string -> namespace:string option -> Message_collector.t -> unit; 13 characters : string -> Message_collector.t -> unit; 14 end_document : Message_collector.t -> unit; 15} 16 17(** Create a checker state package from a first-class module. *) 18let make_checker_state (module C : Checker.S) = 19 let state = C.create () in 20 { 21 start_element = (fun ~name ~namespace ~attrs collector -> 22 C.start_element state ~name ~namespace ~attrs collector); 23 end_element = (fun ~name ~namespace collector -> 24 C.end_element state ~name ~namespace collector); 25 characters = (fun text collector -> 26 C.characters state text collector); 27 end_document = (fun collector -> 28 C.end_document state collector); 29 } 30 31(** Walk a DOM node with a single checker state. *) 32let rec walk_node_single cs collector node = 33 let open Html5rw.Dom in 34 match node.name with 35 | "#text" -> 36 (* Text node: emit characters event *) 37 cs.characters node.data collector 38 | "#comment" -> 39 (* Comment node: skip - comment content is not text content *) 40 () 41 | "#document" | "#document-fragment" -> 42 (* Document/fragment nodes: just traverse children *) 43 List.iter (walk_node_single cs collector) node.children 44 | "!doctype" -> 45 (* Doctype node: skip (no validation events for doctype) *) 46 () 47 | _ -> 48 (* Element node: emit start, traverse children, emit end *) 49 cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector; 50 List.iter (walk_node_single cs collector) node.children; 51 cs.end_element ~name:node.name ~namespace:node.namespace collector 52 53let walk checker collector node = 54 let cs = make_checker_state checker in 55 walk_node_single cs collector node; 56 cs.end_document collector 57 58(** Walk a DOM node with multiple checker states. *) 59let rec walk_node_all css collector node = 60 let open Html5rw.Dom in 61 match node.name with 62 | "#text" -> 63 (* Text node: emit characters event to all checkers *) 64 List.iter (fun cs -> cs.characters node.data collector) css 65 | "#comment" -> 66 (* Comment node: skip - comment content is not text content *) 67 () 68 | "#document" | "#document-fragment" -> 69 (* Document/fragment nodes: just traverse children *) 70 List.iter (walk_node_all css collector) node.children 71 | "!doctype" -> 72 (* Doctype node: skip *) 73 () 74 | _ -> 75 (* Element node: emit start to all checkers, traverse children, emit end to all *) 76 List.iter (fun cs -> 77 cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector 78 ) css; 79 List.iter (walk_node_all css collector) node.children; 80 List.iter (fun cs -> 81 cs.end_element ~name:node.name ~namespace:node.namespace collector 82 ) css 83 84let walk_all checkers collector node = 85 (* Create checker state packages *) 86 let css = List.map make_checker_state checkers in 87 (* Traverse with all checkers *) 88 walk_node_all css collector node; 89 (* Call end_document on all checkers *) 90 List.iter (fun cs -> cs.end_document collector) css 91 92let walk_registry registry collector node = 93 let checkers = Checker_registry.all registry in 94 walk_all checkers collector node