(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** Html5rw - Pure OCaml HTML5 Parser
7
8 This module provides a complete HTML5 parsing solution following the
9 WHATWG specification. It uses bytesrw for streaming input/output.
10
11 {2 Quick Start}
12
13 Parse HTML from a reader:
14 {[
15 open Bytesrw
16 let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
17 let result = Html5rw.parse reader in
18 let html = Html5rw.to_string result
19 ]}
20
21 Parse from a file:
22 {[
23 open Bytesrw
24 let ic = open_in "page.html" in
25 let reader = Bytes.Reader.of_in_channel ic in
26 let result = Html5rw.parse reader in
27 close_in ic
28 ]}
29
30 Query with CSS selectors:
31 {[
32 let result = Html5rw.parse reader in
33 let divs = Html5rw.query result "div.content"
34 ]}
35*)
36
37(** {1 Error Handling} *)
38
(** Unified error type for the Html5rw library.

    HTML parse errors and CSS selector errors are folded into the single
    variant {!Error.t}; this module also supplies the conversions into that
    type and the printers used for display and debugging. *)
module Error = struct
  (** An error reported by the library. *)
  type t =
    | Parse_error of { code : Parse_error_code.t; line : int; column : int }
        (** An HTML parse error: its code together with the line and column
            at which it was reported. *)
    | Selector_error of Selector.Error_code.t
        (** A CSS selector parse error, identified by its code. *)

  (** [of_parse_error err] repackages a parser error as a {!t}, reading the
      code, line and column through the [Parser] accessors. *)
  let of_parse_error (err : Parser.parse_error) : t =
    let code = Parser.error_code err in
    let line = Parser.error_line err in
    let column = Parser.error_column err in
    Parse_error { code; line; column }

  (** [of_selector_error code] wraps a selector error code as a {!t}. *)
  let of_selector_error (code : Selector.Error_code.t) : t =
    Selector_error code

  (** [to_string err] renders [err] as a one-line message:
      ["Parse error at LINE:COLUMN: CODE"] for parse errors and
      ["Selector error: DESCRIPTION"] for selector errors, where the
      description comes from [Selector.Error_code.to_human_string]. *)
  let to_string err =
    match err with
    | Selector_error code ->
        Printf.sprintf "Selector error: %s"
          (Selector.Error_code.to_human_string code)
    | Parse_error { code; line; column } ->
        Printf.sprintf "Parse error at %d:%d: %s" line column
          (Parse_error_code.to_string code)

  (** [pp ppf err] prints the {!to_string} rendering of [err] on [ppf]. *)
  let pp ppf err = err |> to_string |> Format.pp_print_string ppf

  (** [code_string err] is the error code of [err] as a kebab-case string,
      without any location information. *)
  let code_string err =
    match err with
    | Selector_error code -> Selector.Error_code.to_string code
    | Parse_error { code; _ } -> Parse_error_code.to_string code
end
81
82(** {1 Sub-modules} *)
83
84(** Parse error code types.

    Re-exports the library's [Parse_error_code] module under this module's
    namespace. {!Error.t} carries these codes in its [Parse_error] case. *)
85module Parse_error_code = Parse_error_code
86
87(** DOM types and manipulation functions.

    The node type, constructors, predicates and serializers defined further
    down in this file are aliases into (or thin wrappers over) this module. *)
88module Dom = Dom
89
90(** HTML5 tokenizer. Re-exported as-is; nothing else in this file uses it
    directly. *)
91module Tokenizer = Tokenizer
92
93(** Encoding detection and decoding.

    Defines the encoding variant that this file re-exports as {!encoding}. *)
94module Encoding = Encoding
95
96(** CSS selector engine. {!query} and {!matches} delegate to it. *)
97module Selector = Selector
98
99(** HTML entity decoding. Re-exported as-is. *)
100module Entities = Entities
101
102(** Low-level parser access.

    {!parse} and {!parse_bytes} call [Parser.parse] and [Parser.parse_bytes]
    and repackage the result as a {!t}. *)
103module Parser = Parser
104
105(** {1 Core Types} *)
106
107(** DOM node type: a transparent alias for [Dom.node], so values flow
    freely between this module and {!Dom}. See {!Dom} for manipulation
    functions. *)
108type node = Dom.node
109
(** Formatter for {!node} values; an alias for [Dom.pp]. *)
110let pp_node = Dom.pp
111
112(** Doctype information.

    Re-exports [Dom.doctype_data] together with its fields, so the record can
    be built and matched on without qualifying field names with [Dom]. Every
    field is optional. *)
113type doctype_data = Dom.doctype_data = {
114 name : string option; (** Doctype name, when one is present. *)
115 public_id : string option; (** Public identifier, when one is present. *)
116 system_id : string option; (** System identifier, when one is present. *)
117}
118
(** Formatter for {!doctype_data}; an alias for [Dom.pp_doctype_data]. *)
119let pp_doctype_data = Dom.pp_doctype_data
120
121(** Source location for nodes.

    Re-exports [Dom.location] together with its fields. The start position is
    always present; the end position is optional. NOTE(review): whether lines
    and columns are 0- or 1-indexed is not stated in this file; confirm
    against {!Dom}. *)
122type location = Dom.location = {
123 line : int; (** Start line. *)
124 column : int; (** Start column. *)
125 end_line : int option; (** End line, when known. *)
126 end_column : int option; (** End column, when known. *)
127}
128
(** Aliases for the [Dom] location helpers of the same name: build a
    {!location}, read the location attached to a node, and attach one. Exact
    signatures are defined in {!Dom}. *)
129let make_location = Dom.make_location
130let get_location = Dom.get_location
131let set_location = Dom.set_location
132
133(** Quirks mode as determined during parsing.

    Re-exports [Dom.quirks_mode] with its three constructors ([No_quirks],
    [Quirks], [Limited_quirks]) so they can be used unqualified. *)
134type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks
135
(** Formatter for {!quirks_mode}; an alias for [Dom.pp_quirks_mode]. *)
136let pp_quirks_mode = Dom.pp_quirks_mode
137
138(** Character encoding detected or specified.

    Re-exports [Encoding.encoding] with its constructors so they can be used
    unqualified. This is the type stored in the [encoding] field of {!t}. *)
139type encoding = Encoding.encoding =
140 | Utf8
141 | Utf16le
142 | Utf16be
143 | Windows_1252
144 | Iso_8859_2
145 | Euc_jp
146
(** Formatter for {!encoding}; an alias for [Encoding.pp]. *)
147let pp_encoding = Encoding.pp
148
149(** Parse error record: an alias for [Parser.parse_error].

    The representation is not exposed here; read it through {!error_code},
    {!error_line} and {!error_column}. *)
150type parse_error = Parser.parse_error
151
152(** Fragment parsing context: an alias for [Parser.fragment_context].

    Built with {!make_fragment_context} and passed to {!parse} or
    {!parse_bytes} through their [?fragment_context] argument. *)
153type fragment_context = Parser.fragment_context
154
155(** Create a fragment parsing context. Alias for
    [Parser.make_fragment_context], which defines the exact signature.

    @param tag_name Tag name of the context element
    @param namespace Namespace (None for HTML, Some "svg", Some "mathml")
*)
159let make_fragment_context = Parser.make_fragment_context
160
161(** Get the tag name from a fragment context. Alias for
    [Parser.fragment_context_tag]. *)
162let fragment_context_tag = Parser.fragment_context_tag
163
164(** Get the namespace from a fragment context. Alias for
    [Parser.fragment_context_namespace]. *)
165let fragment_context_namespace = Parser.fragment_context_namespace
166
(** Formatter for {!fragment_context}; an alias for
    [Parser.pp_fragment_context]. *)
167let pp_fragment_context = Parser.pp_fragment_context
168
169(** Get the error code of a {!parse_error}. Alias for [Parser.error_code];
    [Error.of_parse_error] uses the same accessor. *)
170let error_code = Parser.error_code
171
172(** Get the line number of an error (1-indexed). Alias for
    [Parser.error_line]. *)
173let error_line = Parser.error_line
174
175(** Get the column number of an error (1-indexed). Alias for
    [Parser.error_column]. *)
176let error_column = Parser.error_column
177
(** Formatter for {!parse_error}; an alias for [Parser.pp_parse_error]. *)
178let pp_parse_error = Parser.pp_parse_error
179
180(** Result of parsing an HTML document.

    Values of this type are produced by {!parse} and {!parse_bytes}, which
    copy the three fields out of the underlying [Parser.t]. The accessors
    {!root}, {!errors} and {!encoding} read them back. *)
181type t = {
182 root : node; (** Root node of the parsed tree. *)
183 errors : parse_error list; (** Parse errors reported by the parser. *)
184 encoding : encoding option; (** Encoding reported by the parser, if any. *)
185}
186
(** Pretty-print a parse result as [{root=…; errors=N; encoding=…}].

    Only the number of collected errors is printed, not the errors
    themselves. An absent encoding prints as nothing, which is the default
    behaviour of [Format.pp_print_option]. *)
let pp ppf { root; errors; encoding } =
  let error_count = List.length errors in
  let pp_encoding_opt = Format.pp_print_option pp_encoding in
  Format.fprintf ppf "{root=%a; errors=%d; encoding=%a}" pp_node root
    error_count pp_encoding_opt encoding
192
(* Internal: repackage a [Parser.t] as the public result record by reading
   its root, its collected errors and its encoding through the [Parser]
   accessors. *)
let of_parser_result (p : Parser.t) : t =
  let root = Parser.root p in
  let errors = Parser.errors p in
  let encoding = Parser.encoding p in
  { root; errors; encoding }
196
197(** {1 Parsing Functions} *)
198
(** [parse reader] parses the HTML read from [reader], a bytesrw
    [Bytes.Reader.t], and returns the outcome as a {!t}.

    This is the primary entry point; it delegates to [Parser.parse]. A
    reader can be created from any source:
    - [Bytes.Reader.of_string s] for strings
    - [Bytes.Reader.of_in_channel ic] for files
    - [Bytes.Reader.of_bytes b] for byte buffers

    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<html><body>Hello</body></html>" in
      let result = Html5rw.parse reader
    ]}

    @param collect_errors If true, collect parse errors (default: false)
    @param fragment_context Context element for fragment parsing *)
let parse ?collect_errors ?fragment_context reader =
  let parsed = Parser.parse ?collect_errors ?fragment_context reader in
  of_parser_result parsed
217
(** [parse_bytes bytes] parses raw bytes, detecting their character
    encoding automatically, and returns the outcome as a {!t}.

    Delegates to [Parser.parse_bytes], which implements the WHATWG encoding
    sniffing algorithm:
    + Check for a BOM (Byte Order Mark)
    + Prescan for [<meta charset>]
    + Fall back to UTF-8

    @param collect_errors If true, collect parse errors (default: false)
    @param transport_encoding Encoding from the HTTP Content-Type header
    @param fragment_context Context element for fragment parsing *)
let parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes =
  let parsed =
    Parser.parse_bytes ?collect_errors ?transport_encoding ?fragment_context
      bytes
  in
  of_parser_result parsed
231
232(** {1 Querying} *)
233
(** [query t selector] runs the CSS selector [selector] against the root
    node of the parse result [t], by calling [Selector.query].

    Supported selectors:
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]
    - Attribute: [[attr]], [[attr="value"]], [[attr~="value"]],
      [[attr|="value"]]
    - Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
    - Combinators: descendant (space), child (>), adjacent sibling (+),
      general sibling (~)

    {[
      let divs = Html5rw.query result "div.content > p"
    ]}

    @raise Selector.Selector_error if the selector is invalid *)
let query { root; _ } selector = Selector.query root selector
252
(** [matches node selector] tells whether [node] matches the CSS selector
    [selector], by calling [Selector.matches].

    NOTE(review): whether an invalid selector raises here, as documented for
    {!query}, is decided by [Selector.matches] and is not visible from this
    file. *)
let matches candidate css = Selector.matches candidate css
255
256(** {1 Serialization} *)
257
(** [to_writer t writer] serializes the tree rooted at the root node of [t]
    to [writer], a bytesrw [Bytes.Writer.t], by calling [Dom.to_writer].

    {[
      open Bytesrw
      let buf = Buffer.create 1024 in
      let writer = Bytes.Writer.of_buffer buf in
      Html5rw.to_writer result writer;
      Bytes.Writer.write_eod writer;
      let html = Buffer.contents buf
    ]}

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2) *)
let to_writer ?pretty ?indent_size { root; _ } writer =
  Dom.to_writer ?pretty ?indent_size writer root
274
(** [to_string t] serializes the tree rooted at the root node of [t] to an
    HTML string, by calling [Dom.to_html].

    A convenience for output that fits in memory; use {!to_writer} to stream
    instead.

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2) *)
let to_string ?pretty ?indent_size { root; _ } =
  Dom.to_html ?pretty ?indent_size root
283
(** [to_text t] extracts the text content of the tree rooted at the root
    node of [t], by calling [Dom.to_text].

    @param separator String to insert between text nodes (default: " ")
    @param strip If true, trim whitespace (default: true) *)
let to_text ?separator ?strip { root; _ } = Dom.to_text ?separator ?strip root
290
(** [to_test_format t] serializes the tree rooted at the root node of [t]
    in the html5lib test format, by calling [Dom.to_test_format]. Intended
    for testing. *)
let to_test_format { root; _ } = Dom.to_test_format root
293
294(** {1 Result Accessors} *)
295
(** [root t] is the root node of the parsed document. *)
let root { root; _ } = root

(** [errors t] is the list of parse errors stored in [t]; errors are only
    collected when error collection was enabled at parse time. *)
let errors { errors; _ } = errors

(** [encoding t] is the detected encoding, if any; one is available when the
    document was parsed from bytes. *)
let encoding { encoding; _ } = encoding
304
305(** {1 DOM Utilities}
306
307 Common DOM operations are available directly. For the full API,
308 see the {!Dom} module.
309*)
310
311(** Create an element node. Alias for [Dom.create_element], which defines
    the exact signature.

    @param namespace None for HTML, Some "svg" or Some "mathml" for foreign content
    @param attrs List of (name, value) attribute pairs
*)
315let create_element = Dom.create_element
316
317(** Create a text node. Alias for [Dom.create_text]. *)
318let create_text = Dom.create_text
319
320(** Create a comment node. Alias for [Dom.create_comment]. *)
321let create_comment = Dom.create_comment
322
323(** Create an empty document node. Alias for [Dom.create_document]. *)
324let create_document = Dom.create_document
325
326(** Create a document fragment node. Alias for
    [Dom.create_document_fragment]. *)
327let create_document_fragment = Dom.create_document_fragment
328
329(** Create a doctype node. Alias for [Dom.create_doctype]; see {!Dom} for
    the arguments it takes. *)
330let create_doctype = Dom.create_doctype
331
332(** Append a child node to a parent. Alias for [Dom.append_child]; the
    argument order is the one defined there. *)
333let append_child = Dom.append_child
334
335(** Insert a node before a reference node. Alias for [Dom.insert_before];
    the argument order is the one defined there. *)
336let insert_before = Dom.insert_before
337
338(** Remove a child node from its parent. Alias for [Dom.remove_child]; the
    argument order is the one defined there. *)
339let remove_child = Dom.remove_child
340
341(** Get an attribute value. Alias for [Dom.get_attr]. NOTE(review): the
    result for a missing attribute is defined in {!Dom} and is not visible
    from this file. *)
342let get_attr = Dom.get_attr
343
344(** Set an attribute value. Alias for [Dom.set_attr]. *)
345let set_attr = Dom.set_attr
346
347(** Check if a node has an attribute. Alias for [Dom.has_attr]. *)
348let has_attr = Dom.has_attr
349
350(** Get all descendant nodes. Alias for [Dom.descendants]. NOTE(review):
    the traversal order is defined in {!Dom} and is not visible from this
    file. *)
351let descendants = Dom.descendants
352
353(** Get all ancestor nodes (from parent to root). Alias for
    [Dom.ancestors]. *)
354let ancestors = Dom.ancestors
355
356(** Get text content of a node and its descendants. Alias for
    [Dom.get_text_content]. *)
357let get_text_content = Dom.get_text_content
358
359(** Clone a node. Alias for [Dom.clone], which defines the exact signature.

    @param deep If true, also clone descendants (default: false)
*)
362let clone = Dom.clone
363
364(** {1 Node Predicates} *)
365
366(** Test if a node is an element. Alias for [Dom.is_element]. *)
367let is_element = Dom.is_element
368
369(** Test if a node is a text node. Alias for [Dom.is_text]. *)
370let is_text = Dom.is_text
371
372(** Test if a node is a comment node. Alias for [Dom.is_comment]. *)
373let is_comment = Dom.is_comment
374
375(** Test if a node is a document node. Alias for [Dom.is_document]. *)
376let is_document = Dom.is_document
377
378(** Test if a node is a document fragment. Alias for
    [Dom.is_document_fragment]. *)
379let is_document_fragment = Dom.is_document_fragment
380
381(** Test if a node is a doctype node. Alias for [Dom.is_doctype]. *)
382let is_doctype = Dom.is_doctype
383
384(** Test if a node has children. Alias for [Dom.has_children]. *)
385let has_children = Dom.has_children
385let has_children = Dom.has_children