OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** DOM tree traversal for HTML5 conformance checking. *)
2
3(** Convert DOM location to Message location. *)
4let dom_location_to_message_location (loc : Html5rw.Dom.location) : Message.location =
5 Message.make_location
6 ~line:loc.line
7 ~column:loc.column
8 ?end_line:loc.end_line
9 ?end_column:loc.end_column
10 ()
11
12(** Get Message.location from a DOM node. *)
13let node_location (node : Html5rw.Dom.node) : Message.location option =
14 Option.map dom_location_to_message_location node.location
15
16(** Package a checker with its state for traversal. *)
17type checker_state = {
18 start_element : element:Element.t -> Message_collector.t -> unit;
19 end_element : tag:Tag.element_tag -> Message_collector.t -> unit;
20 characters : string -> Message_collector.t -> unit;
21 end_document : Message_collector.t -> unit;
22}
23
24(** Create a checker state package from a first-class module. *)
25let make_checker_state (module C : Checker.S) =
26 let state = C.create () in
27 {
28 start_element = (fun ~element collector ->
29 C.start_element state ~element collector);
30 end_element = (fun ~tag collector ->
31 C.end_element state ~tag collector);
32 characters = (fun text collector ->
33 C.characters state text collector);
34 end_document = (fun collector ->
35 C.end_document state collector);
36 }
37
38(** Walk a DOM node with a single checker state. *)
39let rec walk_node_single cs collector node =
40 let open Html5rw.Dom in
41 (* Set current location for messages *)
42 Message_collector.set_current_location collector (node_location node);
43 match node.name with
44 | "#text" ->
45 (* Text node: emit characters event *)
46 cs.characters node.data collector
47 | "#comment" ->
48 (* Comment node: skip - comment content is not text content *)
49 ()
50 | "#document" | "#document-fragment" ->
51 (* Document/fragment nodes: just traverse children *)
52 List.iter (walk_node_single cs collector) node.children
53 | "!doctype" ->
54 (* Doctype node: skip (no validation events for doctype) *)
55 ()
56 | _ ->
57 (* Element node: create typed element, emit start, traverse children, emit end *)
58 let element = Element.create ~name:node.name ~namespace:node.namespace ~attrs:node.attrs in
59 cs.start_element ~element collector;
60 List.iter (walk_node_single cs collector) node.children;
61 cs.end_element ~tag:element.tag collector
62
63let walk checker collector node =
64 let cs = make_checker_state checker in
65 walk_node_single cs collector node;
66 cs.end_document collector
67
68(** Walk a DOM node with multiple checker states. *)
69let rec walk_node_all css collector node =
70 let open Html5rw.Dom in
71 (* Set current location for messages *)
72 Message_collector.set_current_location collector (node_location node);
73 match node.name with
74 | "#text" ->
75 (* Text node: emit characters event to all checkers *)
76 List.iter (fun cs -> cs.characters node.data collector) css
77 | "#comment" ->
78 (* Comment node: skip - comment content is not text content *)
79 ()
80 | "#document" | "#document-fragment" ->
81 (* Document/fragment nodes: just traverse children *)
82 List.iter (walk_node_all css collector) node.children
83 | "!doctype" ->
84 (* Doctype node: skip *)
85 ()
86 | _ ->
87 (* Element node: create typed element, emit start to all checkers, traverse children, emit end to all *)
88 let element = Element.create ~name:node.name ~namespace:node.namespace ~attrs:node.attrs in
89 List.iter (fun cs -> cs.start_element ~element collector) css;
90 List.iter (walk_node_all css collector) node.children;
91 List.iter (fun cs -> cs.end_element ~tag:element.tag collector) css
92
93let walk_all checkers collector node =
94 (* Create checker state packages *)
95 let css = List.map make_checker_state checkers in
96 (* Traverse with all checkers *)
97 walk_node_all css collector node;
98 (* Call end_document on all checkers *)
99 List.iter (fun cs -> cs.end_document collector) css
100
101let walk_registry registry collector node =
102 let checkers = Checker_registry.all registry in
103 walk_all checkers collector node