OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 5fafb7bdddbb5b5a195339d9af8baef4a8513e71 111 lines 4.2 kB view raw
1(** DOM tree traversal for HTML5 conformance checking. *) 2 3(** Convert DOM location to Message location. *) 4let dom_location_to_message_location (loc : Html5rw.Dom.location) : Message.location = 5 Message.make_location 6 ~line:loc.line 7 ~column:loc.column 8 ?end_line:loc.end_line 9 ?end_column:loc.end_column 10 () 11 12(** Get Message.location from a DOM node. *) 13let node_location (node : Html5rw.Dom.node) : Message.location option = 14 Option.map dom_location_to_message_location node.location 15 16(** Package a checker with its state for traversal. *) 17type checker_state = { 18 start_element : 19 name:string -> 20 namespace:string option -> 21 attrs:(string * string) list -> 22 Message_collector.t -> 23 unit; 24 end_element : 25 name:string -> namespace:string option -> Message_collector.t -> unit; 26 characters : string -> Message_collector.t -> unit; 27 end_document : Message_collector.t -> unit; 28} 29 30(** Create a checker state package from a first-class module. *) 31let make_checker_state (module C : Checker.S) = 32 let state = C.create () in 33 { 34 start_element = (fun ~name ~namespace ~attrs collector -> 35 C.start_element state ~name ~namespace ~attrs collector); 36 end_element = (fun ~name ~namespace collector -> 37 C.end_element state ~name ~namespace collector); 38 characters = (fun text collector -> 39 C.characters state text collector); 40 end_document = (fun collector -> 41 C.end_document state collector); 42 } 43 44(** Walk a DOM node with a single checker state. *) 45let rec walk_node_single cs collector node = 46 let open Html5rw.Dom in 47 (* Set current location for messages *) 48 Message_collector.set_current_location collector (node_location node); 49 match node.name with 50 | "#text" -> 51 (* Text node: emit characters event *) 52 cs.characters node.data collector 53 | "#comment" -> 54 (* Comment node: skip - comment content is not text content *) 55 () 56 | "#document" | "#document-fragment" -> 57 (* Document/fragment nodes: just traverse children *) 58 List.iter (walk_node_single cs collector) node.children 59 | "!doctype" -> 60 (* Doctype node: skip (no validation events for doctype) *) 61 () 62 | _ -> 63 (* Element node: emit start, traverse children, emit end *) 64 cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector; 65 List.iter (walk_node_single cs collector) node.children; 66 cs.end_element ~name:node.name ~namespace:node.namespace collector 67 68let walk checker collector node = 69 let cs = make_checker_state checker in 70 walk_node_single cs collector node; 71 cs.end_document collector 72 73(** Walk a DOM node with multiple checker states. *) 74let rec walk_node_all css collector node = 75 let open Html5rw.Dom in 76 (* Set current location for messages *) 77 Message_collector.set_current_location collector (node_location node); 78 match node.name with 79 | "#text" -> 80 (* Text node: emit characters event to all checkers *) 81 List.iter (fun cs -> cs.characters node.data collector) css 82 | "#comment" -> 83 (* Comment node: skip - comment content is not text content *) 84 () 85 | "#document" | "#document-fragment" -> 86 (* Document/fragment nodes: just traverse children *) 87 List.iter (walk_node_all css collector) node.children 88 | "!doctype" -> 89 (* Doctype node: skip *) 90 () 91 | _ -> 92 (* Element node: emit start to all checkers, traverse children, emit end to all *) 93 List.iter (fun cs -> 94 cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector 95 ) css; 96 List.iter (walk_node_all css collector) node.children; 97 List.iter (fun cs -> 98 cs.end_element ~name:node.name ~namespace:node.namespace collector 99 ) css 100 101let walk_all checkers collector node = 102 (* Create checker state packages *) 103 let css = List.map make_checker_state checkers in 104 (* Traverse with all checkers *) 105 walk_node_all css collector node; 106 (* Call end_document on all checkers *) 107 List.iter (fun cs -> cs.end_document collector) css 108 109let walk_registry registry collector node = 110 let checkers = Checker_registry.all registry in 111 walk_all checkers collector node