OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** DOM tree traversal for HTML5 conformance checking. *)
2
3(** Package a checker with its state for traversal. *)
4type checker_state = {
5 start_element :
6 name:string ->
7 namespace:string option ->
8 attrs:(string * string) list ->
9 Message_collector.t ->
10 unit;
11 end_element :
12 name:string -> namespace:string option -> Message_collector.t -> unit;
13 characters : string -> Message_collector.t -> unit;
14 end_document : Message_collector.t -> unit;
15}
16
17(** Create a checker state package from a first-class module. *)
18let make_checker_state (module C : Checker.S) =
19 let state = C.create () in
20 {
21 start_element = (fun ~name ~namespace ~attrs collector ->
22 C.start_element state ~name ~namespace ~attrs collector);
23 end_element = (fun ~name ~namespace collector ->
24 C.end_element state ~name ~namespace collector);
25 characters = (fun text collector ->
26 C.characters state text collector);
27 end_document = (fun collector ->
28 C.end_document state collector);
29 }
30
31(** Walk a DOM node with a single checker state. *)
32let rec walk_node_single cs collector node =
33 let open Html5rw.Dom in
34 match node.name with
35 | "#text" ->
36 (* Text node: emit characters event *)
37 cs.characters node.data collector
38 | "#comment" ->
39 (* Comment node: skip - comment content is not text content *)
40 ()
41 | "#document" | "#document-fragment" ->
42 (* Document/fragment nodes: just traverse children *)
43 List.iter (walk_node_single cs collector) node.children
44 | "!doctype" ->
45 (* Doctype node: skip (no validation events for doctype) *)
46 ()
47 | _ ->
48 (* Element node: emit start, traverse children, emit end *)
49 cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector;
50 List.iter (walk_node_single cs collector) node.children;
51 cs.end_element ~name:node.name ~namespace:node.namespace collector
52
53let walk checker collector node =
54 let cs = make_checker_state checker in
55 walk_node_single cs collector node;
56 cs.end_document collector
57
58(** Walk a DOM node with multiple checker states. *)
59let rec walk_node_all css collector node =
60 let open Html5rw.Dom in
61 match node.name with
62 | "#text" ->
63 (* Text node: emit characters event to all checkers *)
64 List.iter (fun cs -> cs.characters node.data collector) css
65 | "#comment" ->
66 (* Comment node: skip - comment content is not text content *)
67 ()
68 | "#document" | "#document-fragment" ->
69 (* Document/fragment nodes: just traverse children *)
70 List.iter (walk_node_all css collector) node.children
71 | "!doctype" ->
72 (* Doctype node: skip *)
73 ()
74 | _ ->
75 (* Element node: emit start to all checkers, traverse children, emit end to all *)
76 List.iter (fun cs ->
77 cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector
78 ) css;
79 List.iter (walk_node_all css collector) node.children;
80 List.iter (fun cs ->
81 cs.end_element ~name:node.name ~namespace:node.namespace collector
82 ) css
83
84let walk_all checkers collector node =
85 (* Create checker state packages *)
86 let css = List.map make_checker_state checkers in
87 (* Traverse with all checkers *)
88 walk_node_all css collector node;
89 (* Call end_document on all checkers *)
90 List.iter (fun cs -> cs.end_document collector) css
91
92let walk_registry registry collector node =
93 let checkers = Checker_registry.all registry in
94 walk_all checkers collector node