lib/html5rw/html5rw.ml at 81c4816404ceafd6d88e08303e3870f364dc0a32 · anil.recoil.org/ocaml-html5rw

OCaml HTML5 parser/serialiser based on Python's JustHTML
ocaml-html5rw / lib / html5rw / html5rw.ml
at 81c4816404ceafd6d88e08303e3870f364dc0a32 11 kB view raw
  1(*---------------------------------------------------------------------------
  2  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
  3  SPDX-License-Identifier: MIT
  4 ---------------------------------------------------------------------------*)
  5
  6(** Html5rw - Pure OCaml HTML5 Parser
  7
  8    This module provides a complete HTML5 parsing solution following the
  9    WHATWG specification. It uses bytesrw for streaming input/output.
 10
 11    {2 Quick Start}
 12
 13    Parse HTML from a reader:
 14    {[
 15      open Bytesrw
 16      let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
 17      let result = Html5rw.parse reader in
 18      let html = Html5rw.to_string result
 19    ]}
 20
 21    Parse from a file:
 22    {[
 23      open Bytesrw
 24      let ic = open_in "page.html" in
 25      let reader = Bytes.Reader.of_in_channel ic in
 26      let result = Html5rw.parse reader in
 27      close_in ic
 28    ]}
 29
 30    Query with CSS selectors:
 31    {[
 32      let result = Html5rw.parse reader in
 33      let divs = Html5rw.query result "div.content"
 34    ]}
 35*)
 36
 37(** {1 Error Handling} *)
 38
 39(** Global error type that wraps all errors raised by the Html5rw library.
 40
 41    This provides a unified error type for all parsing and selector errors,
 42    along with printers for display and debugging.
 43*)
 44module Error = struct
 45  (** The unified error type for the Html5rw library. *)
 46  type t =
 47    | Parse_error of {
 48        code : Parse_error_code.t;
 49        line : int;
 50        column : int;
 51      }
 52        (** An HTML parse error, including location information. *)
 53    | Selector_error of Selector.Error_code.t
 54        (** A CSS selector parse error. *)
 55
 56  let of_parse_error (err : Parser.parse_error) : t =
 57    Parse_error {
 58      code = Parser.error_code err;
 59      line = Parser.error_line err;
 60      column = Parser.error_column err;
 61    }
 62
 63  let of_selector_error (code : Selector.Error_code.t) : t =
 64    Selector_error code
 65
 66  let to_string = function
 67    | Parse_error { code; line; column } ->
 68        Printf.sprintf "Parse error at %d:%d: %s" line column
 69          (Parse_error_code.to_string code)
 70    | Selector_error code ->
 71        Printf.sprintf "Selector error: %s"
 72          (Selector.Error_code.to_human_string code)
 73
 74  let pp fmt err = Format.pp_print_string fmt (to_string err)
 75
 76  (** Get the error code as a kebab-case string. *)
 77  let code_string = function
 78    | Parse_error { code; _ } -> Parse_error_code.to_string code
 79    | Selector_error code -> Selector.Error_code.to_string code
 80end
 81
 82(** {1 Sub-modules} *)
 83
(** Parse error code types.

    Re-exported so that clients can refer to it as
    [Html5rw.Parse_error_code]. *)
module Parse_error_code = Parse_error_code

(** DOM types and manipulation functions.

    Re-exported as [Html5rw.Dom]; the most common operations are also
    aliased at the top level of this module. *)
module Dom = Dom

(** HTML5 tokenizer, re-exported as [Html5rw.Tokenizer]. *)
module Tokenizer = Tokenizer

(** Encoding detection and decoding, re-exported as [Html5rw.Encoding]. *)
module Encoding = Encoding

(** CSS selector engine, re-exported as [Html5rw.Selector]. *)
module Selector = Selector

(** HTML entity decoding, re-exported as [Html5rw.Entities]. *)
module Entities = Entities

(** Low-level parser access, re-exported as [Html5rw.Parser]. *)
module Parser = Parser
104
105(** {1 Core Types} *)
106
(** A DOM node. This is {!Dom.node}; see {!Dom} for the functions that
    inspect and modify nodes. *)
type node = Dom.node

(** Printer for nodes (alias of {!Dom.pp}). *)
let pp_node = Dom.pp

(** Doctype information, re-exported from {!Dom} so that the fields can be
    used without qualification. *)
type doctype_data = Dom.doctype_data = {
  name : string option;
  public_id : string option;
  system_id : string option;
}

(** Printer for doctype data (alias of {!Dom.pp_doctype_data}). *)
let pp_doctype_data = Dom.pp_doctype_data

(** Quirks mode as determined during parsing, re-exported from {!Dom}. *)
type quirks_mode = Dom.quirks_mode =
  | No_quirks
  | Quirks
  | Limited_quirks

(** Printer for quirks modes (alias of {!Dom.pp_quirks_mode}). *)
let pp_quirks_mode = Dom.pp_quirks_mode

(** Character encoding detected or specified, re-exported from
    {!Encoding}. *)
type encoding = Encoding.encoding =
  | Utf8 | Utf16le | Utf16be | Windows_1252 | Iso_8859_2 | Euc_jp

(** Printer for encodings (alias of {!Encoding.pp}). *)
let pp_encoding = Encoding.pp
136
(** A parse error as recorded by the parser (same type as
    {!Parser.parse_error}). Use {!error_code}, {!error_line} and
    {!error_column} to inspect it. *)
type parse_error = Parser.parse_error

(** Context element for fragment parsing (same type as
    {!Parser.fragment_context}). *)
type fragment_context = Parser.fragment_context

(** Build a fragment parsing context. Alias of
    {!Parser.make_fragment_context}.
    @param tag_name Tag name of the context element
    @param namespace Namespace (None for HTML, Some "svg", Some "mathml")
*)
let make_fragment_context = Parser.make_fragment_context

(** Tag name stored in a fragment context. *)
let fragment_context_tag = Parser.fragment_context_tag

(** Namespace stored in a fragment context. *)
let fragment_context_namespace = Parser.fragment_context_namespace

(** Printer for fragment contexts (alias of
    {!Parser.pp_fragment_context}). *)
let pp_fragment_context = Parser.pp_fragment_context

(** [error_code err] is the error code carried by [err]. *)
let error_code (err : parse_error) = Parser.error_code err

(** [error_line err] is the line number of [err] (1-indexed). *)
let error_line (err : parse_error) = Parser.error_line err

(** [error_column err] is the column number of [err] (1-indexed). *)
let error_column (err : parse_error) = Parser.error_column err

(** Printer for parse errors (alias of {!Parser.pp_parse_error}). *)
let pp_parse_error = Parser.pp_parse_error
167
(** Outcome of parsing an HTML document: the tree, the parse errors that
    were recorded, and the encoding when one is known. *)
type t = {
  root : node;
  errors : parse_error list;
  encoding : encoding option;
}

(** [pp fmt t] prints a short summary of [t]: the root node, the number of
    recorded errors (not the errors themselves) and the encoding. *)
let pp fmt { root; errors; encoding } =
  let error_count = List.length errors in
  let pp_encoding_opt = Format.pp_print_option pp_encoding in
  Format.fprintf fmt "{root=%a; errors=%d; encoding=%a}"
    pp_node root
    error_count
    pp_encoding_opt encoding

(* Internal: repackage a low-level [Parser.t] as the public result record. *)
let of_parser_result (p : Parser.t) : t =
  let root = Parser.root p in
  let errors = Parser.errors p in
  let encoding = Parser.encoding p in
  { root; errors; encoding }
184
185(** {1 Parsing Functions} *)
186
(** Parse HTML read from a [Bytes.Reader.t].

    This is the main entry point. Wrap any byte source in a reader first:
    - [Bytes.Reader.of_string s] for strings
    - [Bytes.Reader.of_in_channel ic] for files
    - [Bytes.Reader.of_bytes b] for byte buffers

    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<html><body>Hello</body></html>" in
      let result = Html5rw.parse reader
    ]}

    Both optional arguments are forwarded unchanged to {!Parser.parse}.

    @param collect_errors If true, collect parse errors (default: false)
    @param fragment_context Context element for fragment parsing
*)
let parse ?collect_errors ?fragment_context reader =
  let parsed = Parser.parse ?collect_errors ?fragment_context reader in
  of_parser_result parsed

(** Parse raw bytes, detecting the character encoding automatically.

    The WHATWG encoding sniffing algorithm is applied:
    + Check for a BOM (Byte Order Mark)
    + Prescan for [<meta charset>]
    + Fall back to UTF-8

    All optional arguments are forwarded unchanged to
    {!Parser.parse_bytes}.

    @param collect_errors If true, collect parse errors (default: false)
    @param transport_encoding Encoding from HTTP Content-Type header
    @param fragment_context Context element for fragment parsing
*)
let parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes =
  let parsed =
    Parser.parse_bytes ?collect_errors ?transport_encoding ?fragment_context
      bytes
  in
  of_parser_result parsed
219
220(** {1 Querying} *)
221
(** [query t selector] runs the CSS [selector] against the root node of the
    parse result [t].

    Supported selectors:
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]
    - Attribute: [[attr]], [[attr="value"]], [[attr~="value"]], [[attr|="value"]]
    - Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
    - Combinators: descendant (space), child (>), adjacent sibling (+), general sibling (~)

    {[
      let divs = Html5rw.query result "div.content > p"
    ]}

    @raise Selector.Selector_error if the selector is invalid
*)
let query { root; _ } selector = Selector.query root selector

(** [matches node selector] tests a single [node] against the CSS
    [selector]. Delegates to {!Selector.matches}. *)
let matches node selector = Selector.matches node selector
243
244(** {1 Serialization} *)
245
(** [to_writer t writer] serializes the DOM tree of [t] to a
    [Bytes.Writer.t].

    {[
      open Bytesrw
      let buf = Buffer.create 1024 in
      let writer = Bytes.Writer.of_buffer buf in
      Html5rw.to_writer result writer;
      Bytes.Writer.write_eod writer;
      let html = Buffer.contents buf
    ]}

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
let to_writer ?pretty ?indent_size { root; _ } writer =
  Dom.to_writer ?pretty ?indent_size writer root

(** [to_string t] serializes the DOM tree of [t] to an HTML string.

    Convenience function for output that fits in memory; use {!to_writer}
    to stream instead.

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
let to_string ?pretty ?indent_size { root; _ } =
  Dom.to_html ?pretty ?indent_size root

(** [to_text t] extracts the text content of the DOM tree of [t].

    @param separator String to insert between text nodes (default: " ")
    @param strip If true, trim whitespace (default: true)
*)
let to_text ?separator ?strip { root; _ } = Dom.to_text ?separator ?strip root

(** [to_test_format t] serializes the tree in the html5lib test format
    (intended for testing). *)
let to_test_format { root; _ } = Dom.to_test_format root
281
282(** {1 Result Accessors} *)
283
(** [root t] is the root node of the parsed document. *)
let root { root; _ } = root

(** [errors t] is the list of parse errors stored in [t] (populated only if
    error collection was enabled). *)
let errors { errors; _ } = errors

(** [encoding t] is the detected encoding, if any (set when parsing from
    bytes). *)
let encoding { encoding; _ } = encoding
292
293(** {1 DOM Utilities}
294
295    Common DOM operations are available directly. For the full API,
296    see the {!Dom} module.
297*)
298
(** Create an element node. Alias of {!Dom.create_element}; see that
    function for the exact signature.
    @param namespace None for HTML, Some "svg" or Some "mathml" for foreign content
    @param attrs List of (name, value) attribute pairs
*)
let create_element = Dom.create_element

(** Create a text node. Alias of {!Dom.create_text}. *)
let create_text = Dom.create_text

(** Create a comment node. Alias of {!Dom.create_comment}. *)
let create_comment = Dom.create_comment

(** Create an empty document node. Alias of {!Dom.create_document}. *)
let create_document = Dom.create_document

(** Create a document fragment node. Alias of
    {!Dom.create_document_fragment}. *)
let create_document_fragment = Dom.create_document_fragment

(** Create a doctype node. Alias of {!Dom.create_doctype}. *)
let create_doctype = Dom.create_doctype

(** Append a child node to a parent. Alias of {!Dom.append_child}. *)
let append_child = Dom.append_child

(** Insert a node before a reference node. Alias of {!Dom.insert_before};
    see that function for the argument order. *)
let insert_before = Dom.insert_before

(** Remove a child node from its parent. Alias of {!Dom.remove_child}. *)
let remove_child = Dom.remove_child

(** Get an attribute value. Alias of {!Dom.get_attr}. *)
let get_attr = Dom.get_attr

(** Set an attribute value. Alias of {!Dom.set_attr}. *)
let set_attr = Dom.set_attr

(** Check if a node has an attribute. Alias of {!Dom.has_attr}. *)
let has_attr = Dom.has_attr

(** Get all descendant nodes. Alias of {!Dom.descendants}. *)
let descendants = Dom.descendants

(** Get all ancestor nodes (from parent to root). Alias of
    {!Dom.ancestors}. *)
let ancestors = Dom.ancestors

(** Get text content of a node and its descendants. Alias of
    {!Dom.get_text_content}. *)
let get_text_content = Dom.get_text_content

(** Clone a node. Alias of {!Dom.clone}.
    @param deep If true, also clone descendants (default: false)
*)
let clone = Dom.clone
351
352(** {1 Node Predicates} *)
353
(** [is_element node] tests whether [node] is an element. *)
let is_element node = Dom.is_element node

(** [is_text node] tests whether [node] is a text node. *)
let is_text node = Dom.is_text node

(** [is_comment node] tests whether [node] is a comment node. *)
let is_comment node = Dom.is_comment node

(** [is_document node] tests whether [node] is a document node. *)
let is_document node = Dom.is_document node

(** [is_document_fragment node] tests whether [node] is a document
    fragment. *)
let is_document_fragment node = Dom.is_document_fragment node

(** [is_doctype node] tests whether [node] is a doctype node. *)
let is_doctype node = Dom.is_doctype node

(** [has_children node] tests whether [node] has children. *)
let has_children node = Dom.has_children node