OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** DOM tree traversal for HTML5 conformance checking. *)
2
3(** Convert DOM location to Message location. *)
4let dom_location_to_message_location (loc : Html5rw.Dom.location) : Message.location =
5 Message.make_location
6 ~line:loc.line
7 ~column:loc.column
8 ?end_line:loc.end_line
9 ?end_column:loc.end_column
10 ()
11
12(** Get Message.location from a DOM node. *)
13let node_location (node : Html5rw.Dom.node) : Message.location option =
14 Option.map dom_location_to_message_location node.location
15
16(** Package a checker with its state for traversal. *)
17type checker_state = {
18 start_element :
19 name:string ->
20 namespace:string option ->
21 attrs:(string * string) list ->
22 Message_collector.t ->
23 unit;
24 end_element :
25 name:string -> namespace:string option -> Message_collector.t -> unit;
26 characters : string -> Message_collector.t -> unit;
27 end_document : Message_collector.t -> unit;
28}
29
30(** Create a checker state package from a first-class module. *)
31let make_checker_state (module C : Checker.S) =
32 let state = C.create () in
33 {
34 start_element = (fun ~name ~namespace ~attrs collector ->
35 C.start_element state ~name ~namespace ~attrs collector);
36 end_element = (fun ~name ~namespace collector ->
37 C.end_element state ~name ~namespace collector);
38 characters = (fun text collector ->
39 C.characters state text collector);
40 end_document = (fun collector ->
41 C.end_document state collector);
42 }
43
44(** Walk a DOM node with a single checker state. *)
45let rec walk_node_single cs collector node =
46 let open Html5rw.Dom in
47 (* Set current location for messages *)
48 Message_collector.set_current_location collector (node_location node);
49 match node.name with
50 | "#text" ->
51 (* Text node: emit characters event *)
52 cs.characters node.data collector
53 | "#comment" ->
54 (* Comment node: skip - comment content is not text content *)
55 ()
56 | "#document" | "#document-fragment" ->
57 (* Document/fragment nodes: just traverse children *)
58 List.iter (walk_node_single cs collector) node.children
59 | "!doctype" ->
60 (* Doctype node: skip (no validation events for doctype) *)
61 ()
62 | _ ->
63 (* Element node: emit start, traverse children, emit end *)
64 cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector;
65 List.iter (walk_node_single cs collector) node.children;
66 cs.end_element ~name:node.name ~namespace:node.namespace collector
67
68let walk checker collector node =
69 let cs = make_checker_state checker in
70 walk_node_single cs collector node;
71 cs.end_document collector
72
73(** Walk a DOM node with multiple checker states. *)
74let rec walk_node_all css collector node =
75 let open Html5rw.Dom in
76 (* Set current location for messages *)
77 Message_collector.set_current_location collector (node_location node);
78 match node.name with
79 | "#text" ->
80 (* Text node: emit characters event to all checkers *)
81 List.iter (fun cs -> cs.characters node.data collector) css
82 | "#comment" ->
83 (* Comment node: skip - comment content is not text content *)
84 ()
85 | "#document" | "#document-fragment" ->
86 (* Document/fragment nodes: just traverse children *)
87 List.iter (walk_node_all css collector) node.children
88 | "!doctype" ->
89 (* Doctype node: skip *)
90 ()
91 | _ ->
92 (* Element node: emit start to all checkers, traverse children, emit end to all *)
93 List.iter (fun cs ->
94 cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector
95 ) css;
96 List.iter (walk_node_all css collector) node.children;
97 List.iter (fun cs ->
98 cs.end_element ~name:node.name ~namespace:node.namespace collector
99 ) css
100
101let walk_all checkers collector node =
102 (* Create checker state packages *)
103 let css = List.map make_checker_state checkers in
104 (* Traverse with all checkers *)
105 walk_node_all css collector node;
106 (* Call end_document on all checkers *)
107 List.iter (fun cs -> cs.end_document collector) css
108
109let walk_registry registry collector node =
110 let checkers = Checker_registry.all registry in
111 walk_all checkers collector node