(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(** DOM tree traversal for HTML5 conformance checking.

    This module provides functions to traverse DOM trees and apply checkers
    to validate HTML5 documents. It implements a depth-first,
    document-order traversal that visits every node in the tree and
    notifies checkers of traversal events.

    {2 Traversal Model}

    The walker follows a SAX-like event model, emitting events as it
    encounters different node types during traversal:

    {v
    Document
    └── html (start_element "html")
        ├── head (start_element "head")
        │   └── title (start_element "title")
        │       ├── #text "Page Title" (characters)
        │       └── (end_element "title")
        └── body (start_element "body")
            └── p (start_element "p")
                ├── #text "Hello " (characters)
                ├── b (start_element "b")
                │   ├── #text "world" (characters)
                │   └── (end_element "b")
                ├── #text "!" (characters)
                └── (end_element "p")
    end_document
    v}

    {2 Event Sequence}

    For each element node:
    1. {!Checker.S.start_element} is called when entering the element
    2. Children are recursively traversed
    3. {!Checker.S.end_element} is called when exiting the element

    For text and comment nodes:
    - {!Checker.S.characters} is called with the text content

    After the entire tree is traversed:
    - {!Checker.S.end_document} is called on all checkers

    {2 Checker Coordination}

    When multiple checkers are used:
    - All checkers receive the same event sequence
    - Events are delivered to checkers in the order they appear in the list
    - Each checker maintains independent state
    - Messages from all checkers are collected together

    This allows composing orthogonal validation rules without interference.

    {2 Usage Examples}

    {b Single checker:}
    {[
      let checker = Checker.noop () in
      let collector = Message_collector.create () in
      walk checker collector dom;
      let messages = Message_collector.messages collector in
      List.iter Message.pp messages
    ]}

    {b Multiple checkers:}
    {[
      let checkers = [checker1; checker2; checker3] in
      let collector = Message_collector.create () in
      walk_all checkers collector dom;
      (* Analyze messages from all checkers *)
    ]}

    {b Registry of checkers:}
    {[
      let registry = Checker_registry.default () in
      let collector = Message_collector.create () in
      walk_registry registry collector dom;
      (* All registered checkers have validated the DOM *)
    ]} *)

(** {1 Single Checker Traversal} *)

val walk : Checker.t -> Message_collector.t -> Html5rw.Dom.node -> unit
(** [walk checker collector node] traverses the DOM tree rooted at [node]
    with a single checker.

    The traversal is depth-first and follows document order (parent
    before children, earlier siblings before later ones). For each
    element, the checker receives a {!Checker.S.start_element} event,
    then the element's children are recursively traversed, then a
    {!Checker.S.end_element} event is emitted.

    After the entire tree is traversed, {!Checker.S.end_document} is
    called to allow the checker to emit any final validation messages.

    {b Example:}
    {[
      (* Validate a parsed HTML document *)
      let checker = Checker.noop () in
      let collector = Message_collector.create () in
      walk checker collector document_node;

      (* Check for errors *)
      let messages = Message_collector.messages collector in
      let errors =
        List.filter
          (fun msg -> msg.Message.severity = Message.Error)
          messages
      in
      if errors <> [] then
        Printf.printf "Found %d errors\n" (List.length errors)
    ]}

    {b Notes:}
    - Only element nodes trigger start/end events
    - Text and comment nodes trigger character events
    - Document and doctype nodes produce no events of their own (the
      children of a document node are still visited, as shown in the
      traversal diagram in the module overview)

    @param checker The checker to apply during traversal
    @param collector The message collector for validation messages
    @param node The root node to start traversal from *)

(** {1 Multiple Checker Traversal} *)

val walk_all :
  Checker.t list -> Message_collector.t -> Html5rw.Dom.node -> unit
(** [walk_all checkers collector node] traverses the DOM tree rooted at
    [node] with multiple checkers.

    This performs a single tree traversal, delivering each event to all
    checkers in sequence. This is more efficient than calling {!walk}
    multiple times.

    All checkers receive events in the order they appear in the list.
    Each checker maintains independent state, so validation rules can
    be composed without interference. After the entire tree is
    traversed, {!Checker.S.end_document} is called on all checkers.

    {b Example:}
    {[
      (* Run multiple validation passes in one traversal *)
      let structure_checker = (module StructureChecker : Checker.S) in
      let attribute_checker = (module AttributeChecker : Checker.S) in
      let obsolete_checker = (module ObsoleteChecker : Checker.S) in

      let checkers =
        [structure_checker; attribute_checker; obsolete_checker]
      in
      let collector = Message_collector.create () in

      walk_all checkers collector document_node;

      (* All three checkers have validated the document *)
      let messages = Message_collector.messages collector in
      Message_format.print_messages messages
    ]}

    {b Empty list behavior:}
    If the checkers list is empty, the tree is traversed but no validation
    is performed. This is equivalent to calling
    [walk (Checker.noop ()) ...].

    @param checkers List of checkers to apply during traversal
    @param collector The message collector for validation messages
    @param node The root node to start traversal from *)

(** {1 Registry-Based Traversal} *)

val walk_registry :
  Checker_registry.t -> Message_collector.t -> Html5rw.Dom.node -> unit
(** [walk_registry registry collector node] traverses the DOM tree rooted
    at [node] with all checkers from [registry].

    This is equivalent to:
    {[
      let checkers = Checker_registry.all registry in
      walk_all checkers collector node
    ]}

    Use this when you want to run a pre-configured set of checkers
    without manually extracting them from the registry.

    {b Example:}
    {[
      (* Set up registry with desired checkers *)
      let registry = Checker_registry.default () in
      Checker_registry.register registry "custom" my_checker;

      (* Validate multiple documents with same checker set *)
      List.iter
        (fun doc ->
          let collector = Message_collector.create () in
          walk_registry registry collector doc;
          report_results collector)
        documents
    ]}

    {b Empty registry behavior:}
    If the registry is empty, the tree is traversed but no validation
    is performed.

    @param registry The registry containing checkers to apply
    @param collector The message collector for validation messages
    @param node The root node to start traversal from *)