(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(** Base checker module for HTML5 conformance checking.

    This module provides the core checker abstraction used throughout the
    html5_checker library. A checker validates HTML5 documents by observing
    DOM tree traversal events and emitting validation messages.

    {2 Design Overview}

    Checkers follow a SAX-like event model where they receive notifications
    about elements, text, and document boundaries as a DOM tree is traversed.
    This design allows for:

    - {b Stateful validation}: Each checker maintains its own state across
      multiple events
    - {b Composability}: Multiple checkers can validate the same document
      simultaneously
    - {b Efficiency}: DOM traversal happens once regardless of checker count

    {2 Checker Lifecycle}

    A checker progresses through these phases:

    + {b Creation}: Initialize with {!S.create} to set up initial state
    + {b Traversal}: Receive {!S.start_element}, {!S.characters}, and
      {!S.end_element} events as the DOM is walked
    + {b Completion}: Finalize validation with {!S.end_document}
    + {b Reset} (optional): Return to initial state with {!S.reset}

    {3 Event Sequence}

    For a document like [<p>Hello <b>world</b></p>], the event sequence is:

    {v
    start_element "p"
    characters "Hello "
    start_element "b"
    characters "world"
    end_element "b"
    end_element "p"
    end_document
    v}

    {2 First-Class Modules}

    Checkers are represented as first-class modules implementing the {!S}
    signature. This allows:

    - Dynamic checker registration and discovery
    - Heterogeneous collections of checkers
    - Checker selection at runtime based on validation requirements

    @see <https://v2.ocaml.org/manual/firstclassmodules.html>
      OCaml manual: First-class modules
*)
55
(** {1 Module Signature} *)

(** The signature that all checker modules must implement.

    A checker module maintains validation state and receives notifications
    about DOM tree traversal events. *)
module type S = sig
  (** The type of checker state.

      This is an abstract type that holds the checker's internal validation
      state. Different checkers will have different state representations
      depending on what they need to track during validation. *)
  type state

  (** {1 Lifecycle Operations} *)

  val create : unit -> state
  (** [create ()] initializes a new checker state.

      This function sets up the initial state needed for validation,
      such as empty stacks for context tracking, counters, or lookup
      tables. *)

  val reset : state -> unit
  (** [reset state] resets the checker to its initial state.

      This allows reusing a checker for multiple documents without
      reallocating. After reset, the checker behaves as if freshly
      created with {!create}. *)

  (** {1 DOM Traversal Events} *)

  val start_element :
    state ->
    element:Element.t ->
    Message_collector.t ->
    unit
  (** [start_element state ~element collector] is called when
      entering an element during DOM traversal.

      This is where checkers can validate:
      - Whether the element is allowed in the current context
      - Whether required attributes are present
      - Whether attribute values are valid
      - Whether the element opens a new validation context

      @param state The checker state
      @param element The typed element (includes tag, typed attrs, and raw attrs)
      @param collector The message collector for emitting validation messages *)

  val end_element :
    state ->
    tag:Tag.element_tag ->
    Message_collector.t ->
    unit
  (** [end_element state ~tag collector] is called when exiting
      an element during DOM traversal.

      This is where checkers can:
      - Pop validation contexts from stacks
      - Validate that required child elements were present
      - Emit messages about element-scoped validation rules

      @param state The checker state
      @param tag The element tag
      @param collector The message collector for emitting validation messages *)

  val characters : state -> string -> Message_collector.t -> unit
  (** [characters state text collector] is called when text content is
      encountered during DOM traversal.

      This is where checkers can validate:
      - Whether text is allowed in the current context
      - Whether text content follows specific patterns
      - Whether text matches expected formats

      @param state The checker state
      @param text The text content
      @param collector The message collector for emitting validation messages *)

  val end_document : state -> Message_collector.t -> unit
  (** [end_document state collector] is called after the entire DOM tree has
      been traversed.

      This is where checkers can:
      - Emit messages about missing required elements
      - Validate document-level constraints
      - Check that all opened contexts were properly closed
      - Report any accumulated validation failures

      @param state The checker state
      @param collector The message collector for emitting validation messages *)
end
149
(** {1 Checker Values} *)

(** The type of a checker value.

    This is a packed first-class module implementing {!S}. Packing hides
    the concrete [state] type of the implementation, which is what makes it
    possible to store heterogeneous checkers in collections and pass them
    around dynamically.

    The package carries the checker's operations, not a state value: after
    unpacking, a state is obtained by calling the module's [create]. *)
type t = (module S)
158
(** {1 Built-in Checkers} *)

val noop : unit -> t
(** [noop ()] creates a no-operation checker that performs no validation.

    This checker ignores all events and never emits messages. It is useful:
    - As a placeholder in checker registries
    - For testing checker infrastructure
    - As a base for building new checkers

    {b Example:}
    {[
      (* Does nothing when walked over a DOM tree *)
      let checker : Checker.t = Checker.noop ()
    ]}
*)
175
(** {1 Checker Construction Helpers} *)

(** Input signature for the {!Make} functor.

    Every item below must be defined. [create], [reset], [start_element]
    and [end_element] are always real callbacks. [characters] and
    [end_document] are [option] values: bind them to [None] to get the
    default no-op behaviour, or to [Some f] to install a callback. *)
module type Input = sig
  (** The checker's state type; it becomes the [state] type of the module
      produced by {!Make}. *)
  type state

  (** Builds a fresh state. *)
  val create : unit -> state

  (** Returns an existing state to its initial condition. *)
  val reset : state -> unit

  (** Called when an element is entered. *)
  val start_element : state -> element:Element.t -> Message_collector.t -> unit

  (** Called when an element is exited. *)
  val end_element : state -> tag:Tag.element_tag -> Message_collector.t -> unit

  (** Optional: called for text content. Default ([None]): no-op. *)
  val characters : (state -> string -> Message_collector.t -> unit) option

  (** Optional: called at document end. Default ([None]): no-op. *)
  val end_document : (state -> Message_collector.t -> unit) option
end
195
(** Functor to create a checker from an {!Input} module.

    This reduces boilerplate when creating checkers that don't need
    to handle all events. The [characters] and [end_document] callbacks
    default to no-ops when the input binds them to [None].

    The result is a module (bind it with [module], not [let]) whose
    [state] type is known to be equal to [I.state].

    {b Example:}
    {[
      module Counter = Checker.Make (struct
        type state = { mutable count : int }

        let create () = { count = 0 }
        let reset s = s.count <- 0
        let start_element s ~element:_ _collector = s.count <- s.count + 1
        let end_element _ ~tag:_ _collector = ()
        let characters = None (* Use default no-op *)
        let end_document = None (* Use default no-op *)
      end)

      (* Pack the module when a Checker.t value is needed. *)
      let checker : Checker.t = (module Counter)
    ]}
*)
module Make : functor (I : Input) -> S with type state = I.state
217
(** Create a checker from individual callback functions.

    This is a simpler alternative to the {!Make} functor that eliminates the
    need for a module wrapper at the end of each checker file. The state
    type ['s] is chosen by the caller and is hidden in the returned {!t}.

    The trailing [unit] argument lets the optional callbacks be omitted.

    {b Example:}
    {[
      type counter = { mutable count : int }

      let checker : Checker.t =
        Checker.make
          ~create:(fun () -> { count = 0 })
          ~reset:(fun s -> s.count <- 0)
          ~start_element:(fun s ~element:_ _collector ->
            s.count <- s.count + 1)
          ~end_element:(fun _ ~tag:_ _collector -> ())
          ()
    ]}

    @param create State initialization function
    @param reset State reset function
    @param start_element Element start callback
    @param end_element Element end callback
    @param characters Optional text content callback (default: no-op)
    @param end_document Optional document end callback (default: no-op)
*)
val make :
  create:(unit -> 's) ->
  reset:('s -> unit) ->
  start_element:('s -> element:Element.t -> Message_collector.t -> unit) ->
  end_element:('s -> tag:Tag.element_tag -> Message_collector.t -> unit) ->
  ?characters:('s -> string -> Message_collector.t -> unit) ->
  ?end_document:('s -> Message_collector.t -> unit) ->
  unit ->
  t