OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** Html5rw - Pure OCaml HTML5 Parser
7
8 This module provides a complete HTML5 parsing solution following the
9 WHATWG specification. It uses bytesrw for streaming input/output.
10
11 {2 Quick Start}
12
13 Parse HTML from a reader:
14 {[
15 open Bytesrw
16 let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
17 let result = Html5rw.parse reader in
18 let html = Html5rw.to_string result
19 ]}
20
21 Parse from a file:
22 {[
23 open Bytesrw
24 let ic = open_in "page.html" in
25 let reader = Bytes.Reader.of_in_channel ic in
26 let result = Html5rw.parse reader in
27 close_in ic
28 ]}
29
30 Query with CSS selectors:
31 {[
32 let result = Html5rw.parse reader in
33 let divs = Html5rw.query result "div.content"
34 ]}
35*)
36
37(** {1 Error Handling} *)
38
(** Library-wide error type covering every error raised by Html5rw.

    HTML parse errors and CSS selector errors are folded into one variant
    type, and the module supplies the conversions and printers needed to
    display or debug them.
*)
module Error = struct
  (** A single type for all Html5rw errors. *)
  type t =
    | Parse_error of {
        code : Parse_error_code.t;
        line : int;
        column : int;
      }
        (** An HTML parse error, with the line and column it refers to. *)
    | Selector_error of Selector.Error_code.t
        (** An error found while parsing a CSS selector. *)

  (** Build an error from a parser error record by reading its code, line
      and column through the [Parser] accessors. *)
  let of_parse_error (err : Parser.parse_error) : t =
    let code = Parser.error_code err in
    let line = Parser.error_line err in
    let column = Parser.error_column err in
    Parse_error { code; line; column }

  (** Wrap a selector error code. *)
  let of_selector_error (code : Selector.Error_code.t) : t =
    Selector_error code

  (** Render an error as a one-line message. Parse errors include their
      [line:column] position followed by the code string; selector errors
      use the human-readable form of the code. *)
  let to_string err =
    match err with
    | Selector_error code ->
        let reason = Selector.Error_code.to_human_string code in
        Printf.sprintf "Selector error: %s" reason
    | Parse_error { code; line; column } ->
        let reason = Parse_error_code.to_string code in
        Printf.sprintf "Parse error at %d:%d: %s" line column reason

  (** [Format]-style printer; emits exactly the text of {!to_string}. *)
  let pp fmt err = err |> to_string |> Format.pp_print_string fmt

  (** Get the error code as a kebab-case string. *)
  let code_string err =
    match err with
    | Selector_error code -> Selector.Error_code.to_string code
    | Parse_error { code; _ } -> Parse_error_code.to_string code
end
81
82(** {1 Sub-modules} *)
83
(** Parse error code types.

    Re-exports the [Parse_error_code] module under this library's namespace. *)
module Parse_error_code = Parse_error_code

(** DOM types and manipulation functions (re-export of [Dom]). *)
module Dom = Dom

(** HTML5 tokenizer (re-export of [Tokenizer]). *)
module Tokenizer = Tokenizer

(** Encoding detection and decoding (re-export of [Encoding]). *)
module Encoding = Encoding

(** CSS selector engine (re-export of [Selector]). *)
module Selector = Selector

(** HTML entity decoding (re-export of [Entities]). *)
module Entities = Entities

(** Low-level parser access (re-export of [Parser]). *)
module Parser = Parser
104
105(** {1 Core Types} *)
106
(** DOM node type. See {!Dom} for manipulation functions. *)
type node = Dom.node

(** [Format]-style printer for {!node} values; this is [Dom.pp] under a
    local name. *)
let pp_node = Dom.pp

(** Doctype information.

    The record is re-declared with an equation to [Dom.doctype_data], so the
    two types are interchangeable and the fields can be used unqualified.
    Every field is optional. *)
type doctype_data = Dom.doctype_data = {
  name : string option;
  public_id : string option;
  system_id : string option;
}

(** Printer for {!doctype_data}; alias of [Dom.pp_doctype_data]. *)
let pp_doctype_data = Dom.pp_doctype_data

(** Quirks mode as determined during parsing.

    Re-declared with an equation to [Dom.quirks_mode] so the constructors are
    usable unqualified. *)
type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks

(** Printer for {!quirks_mode}; alias of [Dom.pp_quirks_mode]. *)
let pp_quirks_mode = Dom.pp_quirks_mode

(** Character encoding detected or specified.

    Re-declared with an equation to [Encoding.encoding] so the constructors
    are usable unqualified. *)
type encoding = Encoding.encoding =
  | Utf8
  | Utf16le
  | Utf16be
  | Windows_1252
  | Iso_8859_2
  | Euc_jp

(** Printer for {!encoding}; alias of [Encoding.pp]. *)
let pp_encoding = Encoding.pp
136
(** Parse error record (same type as [Parser.parse_error]).

    Use {!error_code}, {!error_line} and {!error_column} to inspect it. *)
type parse_error = Parser.parse_error

(** Fragment parsing context (same type as [Parser.fragment_context]). *)
type fragment_context = Parser.fragment_context

(** Create a fragment parsing context. Alias of
    [Parser.make_fragment_context].
    @param tag_name Tag name of the context element
    @param namespace Namespace (None for HTML, Some "svg", Some "mathml")
*)
let make_fragment_context = Parser.make_fragment_context

(** Get the tag name from a fragment context *)
let fragment_context_tag = Parser.fragment_context_tag

(** Get the namespace from a fragment context *)
let fragment_context_namespace = Parser.fragment_context_namespace

(** Printer for {!fragment_context}; alias of [Parser.pp_fragment_context]. *)
let pp_fragment_context = Parser.pp_fragment_context

(** Get the error code *)
let error_code = Parser.error_code

(** Get the line number of an error (1-indexed) *)
let error_line = Parser.error_line

(** Get the column number of an error (1-indexed) *)
let error_column = Parser.error_column

(** Printer for {!parse_error}; alias of [Parser.pp_parse_error]. *)
let pp_parse_error = Parser.pp_parse_error
167
(** Result of parsing an HTML document.

    Values of this type are produced by the parsing functions of this module,
    which copy the three fields out of a [Parser.t]. *)
type t = {
  root : node;  (** Root node of the parsed tree. *)
  errors : parse_error list;  (** Parse errors reported by the parser. *)
  encoding : encoding option;  (** Encoding reported by the parser, if any. *)
}
174
(** Debug printer for a parse result.

    Prints the root node, the number of collected errors (not the errors
    themselves) and the encoding. When no encoding is recorded the text
    [<none>] is printed: [Format.pp_print_option] prints nothing for [None]
    unless [~none] is supplied, which would leave an ambiguous ["encoding=}"]
    in the output. *)
let pp fmt t =
  let pp_no_encoding fmt () = Format.pp_print_string fmt "<none>" in
  Format.fprintf fmt "{root=%a; errors=%d; encoding=%a}"
    pp_node t.root
    (List.length t.errors)
    (Format.pp_print_option ~none:pp_no_encoding pp_encoding) t.encoding
180
(* Internal: repackage a low-level [Parser.t] as this module's record by
   reading its root, errors and encoding through the [Parser] accessors. *)
let of_parser_result (p : Parser.t) : t =
  let root = Parser.root p in
  let errors = Parser.errors p in
  let encoding = Parser.encoding p in
  { root; errors; encoding }
184
185(** {1 Parsing Functions} *)
186
(** Parse HTML read from a [Bytes.Reader.t].

    This is the main entry point. Any byte source that can be wrapped in a
    reader works:
    - [Bytes.Reader.of_string s] for strings
    - [Bytes.Reader.of_in_channel ic] for files
    - [Bytes.Reader.of_bytes b] for byte buffers

    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<html><body>Hello</body></html>" in
      let result = Html5rw.parse reader
    ]}

    Both optional arguments are forwarded unchanged to [Parser.parse].

    @param collect_errors If true, collect parse errors (default: false)
    @param fragment_context Context element for fragment parsing
*)
let parse ?collect_errors ?fragment_context reader =
  Parser.parse ?collect_errors ?fragment_context reader |> of_parser_result
205
(** Parse raw bytes, detecting the character encoding automatically.

    Encoding detection follows the WHATWG encoding sniffing algorithm:
    1. Check for BOM (Byte Order Mark)
    2. Prescan for [<meta charset>]
    3. Fall back to UTF-8

    All optional arguments are forwarded unchanged to [Parser.parse_bytes].

    @param collect_errors If true, collect parse errors (default: false)
    @param transport_encoding Encoding from HTTP Content-Type header
    @param fragment_context Context element for fragment parsing
*)
let parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes =
  Parser.parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes
  |> of_parser_result
219
220(** {1 Querying} *)
221
(** Run a CSS selector against the root of a parse result.

    The selector is evaluated by [Selector.query] starting from the result's
    root node.

    Supported selectors:
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]
    - Attribute: [[attr]], [[attr="value"]], [[attr~="value"]], [[attr|="value"]]
    - Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
    - Combinators: descendant (space), child (>), adjacent sibling (+), general sibling (~)

    {[
      let divs = Html5rw.query result "div.content > p"
    ]}

    @raise Selector.Selector_error if the selector is invalid
*)
let query { root; _ } selector = Selector.query root selector
240
(** Check if a node matches a CSS selector.

    Thin wrapper over [Selector.matches]; [node] and [selector] are passed
    through unchanged. Unlike {!query}, this takes a node rather than a parse
    result.

    NOTE(review): an invalid selector presumably fails the same way as in
    {!query} — confirm against [Selector.matches]. *)
let matches node selector = Selector.matches node selector
243
244(** {1 Serialization} *)
245
(** Serialize the DOM tree of a parse result into a [Bytes.Writer.t].

    The work is done by [Dom.to_writer] on the result's root node. Note the
    argument order here is result first, writer second.

    {[
      open Bytesrw
      let buf = Buffer.create 1024 in
      let writer = Bytes.Writer.of_buffer buf in
      Html5rw.to_writer result writer;
      Bytes.Writer.write_eod writer;
      let html = Buffer.contents buf
    ]}

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
let to_writer ?pretty ?indent_size { root; _ } writer =
  Dom.to_writer ?pretty ?indent_size writer root
262
(** Render the DOM tree of a parse result as an HTML string.

    A convenience over {!to_writer} for output that fits in memory; the
    string is produced by [Dom.to_html] on the result's root node.

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
let to_string ?pretty ?indent_size { root; _ } =
  Dom.to_html ?pretty ?indent_size root
271
(** Collect the text content of the DOM tree of a parse result.

    Delegates to [Dom.to_text] on the result's root node.

    @param separator String to insert between text nodes (default: " ")
    @param strip If true, trim whitespace (default: true)
*)
let to_text ?separator ?strip { root; _ } =
  Dom.to_text ?separator ?strip root
278
(** Dump the tree in the html5lib test format (intended for testing).

    Delegates to [Dom.to_test_format] on the result's root node. *)
let to_test_format { root; _ } = Dom.to_test_format root
281
282(** {1 Result Accessors} *)
283
(** Root node of the parsed document. *)
let root { root; _ } = root
286
(** Parse errors stored in the result (populated if error collection was
    enabled). *)
let errors { errors; _ } = errors
289
(** Detected encoding stored in the result (if parsed from bytes). *)
let encoding { encoding; _ } = encoding
292
293(** {1 DOM Utilities}
294
295 Common DOM operations are available directly. For the full API,
296 see the {!Dom} module.
297*)
298
(** Create an element node. Alias of [Dom.create_element].
    @param namespace None for HTML, Some "svg" or Some "mathml" for foreign content
    @param attrs List of (name, value) attribute pairs
*)
let create_element = Dom.create_element

(** Create a text node. Alias of [Dom.create_text]. *)
let create_text = Dom.create_text

(** Create a comment node. Alias of [Dom.create_comment]. *)
let create_comment = Dom.create_comment

(** Create an empty document node. Alias of [Dom.create_document]. *)
let create_document = Dom.create_document

(** Create a document fragment node. Alias of
    [Dom.create_document_fragment]. *)
let create_document_fragment = Dom.create_document_fragment

(** Create a doctype node. Alias of [Dom.create_doctype]. *)
let create_doctype = Dom.create_doctype

(** Append a child node to a parent. Alias of [Dom.append_child]. *)
let append_child = Dom.append_child

(** Insert a node before a reference node. Alias of [Dom.insert_before]. *)
let insert_before = Dom.insert_before

(** Remove a child node from its parent. Alias of [Dom.remove_child]. *)
let remove_child = Dom.remove_child

(** Get an attribute value. Alias of [Dom.get_attr]. *)
let get_attr = Dom.get_attr

(** Set an attribute value. Alias of [Dom.set_attr]. *)
let set_attr = Dom.set_attr

(** Check if a node has an attribute. Alias of [Dom.has_attr]. *)
let has_attr = Dom.has_attr

(** Get all descendant nodes. Alias of [Dom.descendants]. *)
let descendants = Dom.descendants

(** Get all ancestor nodes (from parent to root). Alias of
    [Dom.ancestors]. *)
let ancestors = Dom.ancestors

(** Get text content of a node and its descendants. Alias of
    [Dom.get_text_content]. *)
let get_text_content = Dom.get_text_content

(** Clone a node. Alias of [Dom.clone].
    @param deep If true, also clone descendants (default: false)
*)
let clone = Dom.clone
351
352(** {1 Node Predicates} *)
353
(** Test if a node is an element. Alias of [Dom.is_element]. *)
let is_element = Dom.is_element

(** Test if a node is a text node. Alias of [Dom.is_text]. *)
let is_text = Dom.is_text

(** Test if a node is a comment node. Alias of [Dom.is_comment]. *)
let is_comment = Dom.is_comment

(** Test if a node is a document node. Alias of [Dom.is_document]. *)
let is_document = Dom.is_document

(** Test if a node is a document fragment. Alias of
    [Dom.is_document_fragment]. *)
let is_document_fragment = Dom.is_document_fragment

(** Test if a node is a doctype node. Alias of [Dom.is_doctype]. *)
let is_doctype = Dom.is_doctype

(** Test if a node has children. Alias of [Dom.has_children]. *)
let has_children = Dom.has_children