OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** Html5rw - Pure OCaml HTML5 Parser
7
8 This library provides a complete HTML5 parsing solution that implements the
9 {{:https://html.spec.whatwg.org/multipage/parsing.html} WHATWG HTML5
10 parsing specification}. It can parse any HTML document - well-formed or not -
11 and produce a DOM (Document Object Model) tree that matches browser behavior.
12
13 {2 What is HTML?}
14
15 HTML (HyperText Markup Language) is the standard markup language for creating
16 web pages. An HTML document consists of nested {i elements} that describe
17 the structure and content of the page:
18
19 {v
20 <!DOCTYPE html>
21 <html>
22 <head>
23 <title>My Page</title>
24 </head>
25 <body>
26 <h1>Welcome</h1>
27 <p>Hello, <b>world</b>!</p>
28 </body>
29 </html>
30 v}
31
32 Each element is written with a {i start tag} (like [<p>]), content, and an
33 {i end tag} (like [</p>]). Elements can have {i attributes} that provide
34 additional information: [<a href="https://example.com">].
35
36 @see <https://html.spec.whatwg.org/multipage/introduction.html>
37 WHATWG: Introduction to HTML
38
39 {2 The DOM}
40
41 When this parser processes HTML, it doesn't just store the text. Instead,
42 it builds a tree structure called the DOM (Document Object Model). Each
43 element, text fragment, and comment becomes a {i node} in this tree:
44
45 {v
46 Document
47 └── html
48 ├── head
49 │ └── title
50 │ └── #text "My Page"
51 └── body
52 ├── h1
53 │ └── #text "Welcome"
54 └── p
55 ├── #text "Hello, "
56 ├── b
57 │ └── #text "world"
58 └── #text "!"
59 v}
60
61 This tree can be traversed, searched, and modified. The {!Dom} module
62 provides types and functions for working with DOM nodes.
63
64 @see <https://html.spec.whatwg.org/multipage/dom.html>
65 WHATWG: Semantics, structure, and APIs of HTML documents (DOM chapter)
66
67 {2 Quick Start}
68
69 Parse HTML from a string:
70 {[
71 open Bytesrw
72 let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
73 let result = Html5rw.parse reader in
74 let html = Html5rw.to_string result
75 ]}
76
77 Parse from a file:
78 {[
79 open Bytesrw
80 let ic = open_in "page.html" in
81 let reader = Bytes.Reader.of_in_channel ic in
82 let result = Html5rw.parse reader in
83 close_in ic
84 ]}
85
86 Query with CSS selectors:
87 {[
88 let result = Html5rw.parse reader in
89 let divs = Html5rw.query result "div.content"
90 ]}
91
92 {2 Error Handling}
93
94 Unlike many parsers, HTML5 parsing {b never fails}. The WHATWG specification
95 defines error recovery rules for every possible malformed input, ensuring
96 all HTML documents produce a valid DOM tree (just as browsers do).
97
98 For example, parsing [<p>Hello<p>World] produces two paragraphs, not an
99 error, because [<p>] implicitly closes the previous [<p>].
100
101 If you need to detect malformed HTML (e.g., for validation), enable error
102 collection with [~collect_errors:true]. Errors are advisory - the parsing
103 still succeeds.
104
105 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
106 WHATWG: Parse errors
107
108 {2 HTML vs XHTML}
109
110 This parser implements {b HTML5 parsing}, not XHTML parsing. Key differences:
111
112 - Tag and attribute names are case-insensitive ([<DIV>] equals [<div>])
113 - Some end tags are optional ([<p>Hello] is valid)
114 - Void elements have no end tag ([<br>], not [<br/>] or [<br></br>])
115 - Boolean attributes need no value ([<input disabled>])
116
117 XHTML uses stricter XML rules. If you need XHTML parsing, use an XML parser.
118
119 @see <https://html.spec.whatwg.org/multipage/syntax.html>
120 WHATWG: The HTML syntax
121*)
122
123(** {1 Sub-modules} *)
124
125(** Parse error code types.
126
127 This module provides the {!Parse_error_code.t} variant type that represents
128 all WHATWG-defined parse errors plus tree construction errors.
129
130 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
131 WHATWG: Parse errors *)
132module Parse_error_code = Parse_error_code
133
134(** DOM types and manipulation functions.
135
136 This module provides the core types for representing HTML documents as
137 DOM trees. It includes:
138 - The {!Dom.node} type representing all kinds of DOM nodes
139 - Functions to create, modify, and traverse nodes
140 - Serialization functions to convert DOM back to HTML
141
142 @see <https://html.spec.whatwg.org/multipage/dom.html>
143 WHATWG: Semantics, structure, and APIs of HTML documents *)
144module Dom = Dom
145
146(** HTML5 tokenizer.
147
148 The tokenizer is the first stage of HTML5 parsing. It converts a stream
149 of characters into a stream of {i tokens}: start tags, end tags, text,
150 comments, and DOCTYPEs.
151
152 Most users don't need to use the tokenizer directly - the {!parse}
153 function handles everything. The tokenizer is exposed for advanced use
154 cases like syntax highlighting or partial parsing.
155
156 @see <https://html.spec.whatwg.org/multipage/parsing.html#tokenization>
157 WHATWG: Tokenization *)
158module Tokenizer = Tokenizer
159
160(** Encoding detection and decoding.
161
162 HTML documents can use various character encodings (UTF-8, ISO-8859-1,
163 etc.). This module implements the WHATWG encoding sniffing algorithm
164 that browsers use to detect the encoding of a document:
165
166 1. Check for a BOM (Byte Order Mark)
167 2. Look for a [<meta charset>] declaration
168 3. Use HTTP Content-Type header hint (if available)
169 4. Fall back to UTF-8
170
171 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
172 WHATWG: Determining the character encoding
173 @see <https://encoding.spec.whatwg.org/>
174 WHATWG Encoding Standard *)
175module Encoding = Encoding
176
177(** CSS selector engine.
178
179 This module provides CSS selector support for querying the DOM tree.
180 CSS selectors are patterns used to select HTML elements based on their
181 tag names, attributes, classes, IDs, and position in the document.
182
183 Example selectors:
184 - [div] - all [<div>] elements
185 - [#header] - element with [id="header"]
186 - [.warning] - elements with [class="warning"]
187 - [div > p] - [<p>] elements that are direct children of [<div>]
188 - [[href]] - elements with an [href] attribute
189
190 @see <https://www.w3.org/TR/selectors-4/>
191 W3C Selectors Level 4 specification *)
192module Selector = Selector
193
194(** HTML entity decoding.
195
196 HTML uses {i character references} to represent characters that are
197 hard to type or have special meaning:
198
199 - Named references: [&amp;] (ampersand), [&lt;] (less than), [&nbsp;] (non-breaking space)
200 - Decimal references: [&#60;] (less than as decimal 60)
201 - Hexadecimal references: [&#x3C;] (less than as hex 3C)
202
203 This module decodes all 2,231 named character references defined in
204 the WHATWG specification, plus numeric references.
205
206 @see <https://html.spec.whatwg.org/multipage/named-characters.html>
207 WHATWG: Named character references *)
208module Entities = Entities
209
210(** Low-level parser access.
211
212 This module exposes the internals of the HTML5 parser for advanced use.
213 Most users should use the top-level {!parse} function instead.
214
215 The parser exposes:
216 - Insertion modes for the tree construction algorithm
217 - The tree builder state machine
218 - Lower-level parsing functions
219
220 @see <https://html.spec.whatwg.org/multipage/parsing.html#tree-construction>
221 WHATWG: Tree construction *)
222module Parser = Parser
223
224(** {1 Core Types} *)
225
226(** DOM node type.
227
228 A node represents one part of an HTML document. Nodes form a tree
229 structure with parent/child relationships. There are several kinds:
230
231 - {b Element nodes}: HTML tags like [<div>], [<p>], [<a>]
232 - {b Text nodes}: Text content within elements
233 - {b Comment nodes}: HTML comments [<!-- ... -->]
234 - {b Document nodes}: The root of a document tree
235 - {b Document fragment nodes}: Lightweight containers
236 - {b Doctype nodes}: The [<!DOCTYPE html>] declaration
237
238 See {!Dom} for manipulation functions.
239
240 @see <https://html.spec.whatwg.org/multipage/dom.html>
241 WHATWG: The DOM *)
242type node = Dom.node
243
244val pp_node : Format.formatter -> node -> unit
245(** Pretty-print a DOM node. Prints a summary representation showing the
246 node type and key attributes. Does not recursively print children. *)
247
248(** DOCTYPE information.
249
250 The DOCTYPE declaration ([<!DOCTYPE html>]) appears at the start of HTML
251 documents. It tells browsers to use standards mode for rendering.
252
253 In HTML5, the DOCTYPE is minimal - just [<!DOCTYPE html>] with no public
254 or system identifiers. Legacy DOCTYPEs may have additional fields.
255
256 @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
257 WHATWG: The DOCTYPE *)
258type doctype_data = Dom.doctype_data = {
259 name : string option;
260 (** DOCTYPE name, typically ["html"] *)
261
262 public_id : string option;
263 (** Public identifier for legacy DOCTYPEs (e.g., XHTML, HTML4) *)
264
265 system_id : string option;
266 (** System identifier (URL) for legacy DOCTYPEs *)
267}
268
269val pp_doctype_data : Format.formatter -> doctype_data -> unit
270(** Pretty-print DOCTYPE data. *)
271
272(** Source location for nodes.
273
274 Records the line and column where a node was found in the source HTML.
275 The end position is optional for nodes like text that may span multiple
276 locations. *)
277type location = Dom.location = {
278 line : int;
279 (** 1-indexed line number where the node starts *)
280
281 column : int;
282 (** 1-indexed column number where the node starts *)
283
284 end_line : int option;
285 (** Optional line number where the node ends *)
286
287 end_column : int option;
288 (** Optional column number where the node ends *)
289}
290
291val make_location : line:int -> column:int -> ?end_line:int -> ?end_column:int -> unit -> location
292(** Create a {!location} from a start [line] and [column], both 1-indexed
 as described on the {!location} record fields.

 [end_line] and [end_column] are optional. NOTE(review): presumably the
 corresponding record fields are [None] when they are omitted - confirm
 against the implementation. *)
293
294val get_location : node -> location option
295(** Get the source location for a node, if set. *)
296
297val set_location : node -> line:int -> column:int -> ?end_line:int -> ?end_column:int -> unit -> unit
298(** Set the source location for a node.

 [line] and [column] are 1-indexed, as for {!location}; [end_line] and
 [end_column] may be omitted. NOTE(review): the [unit] result suggests the
 node is updated in place - confirm against the implementation. *)
299
300(** Quirks mode as determined during parsing.
301
302 {i Quirks mode} controls how browsers render CSS and compute layouts.
303 It exists for backwards compatibility with old web pages that relied
304 on browser bugs.
305
306 - {b No_quirks}: Standards mode. The document is rendered according to
307 modern HTML5 and CSS specifications. Triggered by [<!DOCTYPE html>].
308
309 - {b Quirks}: Full quirks mode. The browser emulates bugs from older
310 browsers (primarily IE5). Triggered by missing or malformed DOCTYPEs.
311 Affects CSS box model, table layout, font inheritance, and more.
312
313 - {b Limited_quirks}: Almost standards mode. Only a few specific quirks
314 are applied, mainly affecting table cell vertical alignment.
315
316 {b Recommendation:} Always use [<!DOCTYPE html>] to ensure standards mode.
317
318 @see <https://quirks.spec.whatwg.org/>
319 Quirks Mode Standard
320 @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
321 WHATWG: How quirks mode is determined *)
322type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks
323
324val pp_quirks_mode : Format.formatter -> quirks_mode -> unit
325(** Pretty-print quirks mode. *)
326
327(** Character encoding detected or specified.
328
329 HTML documents are sequences of bytes that must be decoded into characters.
330 Different encodings interpret the same bytes differently. For example:
331
332 - UTF-8: The modern standard, supporting all Unicode characters
333 - Windows-1252: Common on older Western European web pages
334 - ISO-8859-2: Used for Central European languages
335 - UTF-16: Used by some Windows applications
336
337 The parser detects encoding automatically when using {!parse_bytes}.
338 The detected encoding is available via {!val-encoding}.
339
340 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
341 WHATWG: Determining the character encoding
342 @see <https://encoding.spec.whatwg.org/>
343 WHATWG Encoding Standard *)
344type encoding = Encoding.encoding =
345 | Utf8
346 (** UTF-8: The dominant encoding for the web, supporting all Unicode *)
347
348 | Utf16le
349 (** UTF-16 Little-Endian: 16-bit encoding, used by Windows *)
350
351 | Utf16be
352 (** UTF-16 Big-Endian: 16-bit encoding, network byte order *)
353
354 | Windows_1252
355 (** Windows-1252 (CP-1252): Western European, superset of ISO-8859-1 *)
356
357 | Iso_8859_2
358 (** ISO-8859-2: Central European (Polish, Czech, Hungarian, etc.) *)
359
360 | Euc_jp
361 (** EUC-JP: Extended Unix Code for Japanese *)
362
363val pp_encoding : Format.formatter -> encoding -> unit
364(** Pretty-print an encoding using its canonical label. *)
365
366(** A parse error encountered during HTML5 parsing.
367
368 HTML5 parsing {b never fails} - the specification defines error recovery
369 for all malformed input. However, conformance checkers can report these
370 errors. Enable error collection with [~collect_errors:true] if you want
371 to detect malformed HTML.
372
373 {b Common parse errors:}
374
375 - ["unexpected-null-character"]: Null byte in the input
376 - ["eof-before-tag-name"]: File ended while reading a tag
377 - ["unexpected-character-in-attribute-name"]: Invalid attribute syntax
378 - ["missing-doctype"]: Document started without [<!DOCTYPE>]
379 - ["duplicate-attribute"]: Same attribute appears twice on an element
380
381 The full list of parse error codes is defined in the WHATWG specification.
382
383 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
384 WHATWG: Complete list of parse errors *)
385type parse_error = Parser.parse_error
386
387(** Get the error code.
388
389 Returns the {!Parse_error_code.t} variant representing this error.
390 This allows pattern matching on specific error types:
391
392 {[
393 match Html5rw.error_code err with
394 | Parse_error_code.Unexpected_null_character -> (* handle *)
395 | Parse_error_code.Eof_in_tag -> (* handle *)
396 | Parse_error_code.Tree_construction_error msg -> (* handle tree error *)
397 | _ -> (* other *)
398 ]}
399
400 Use {!Parse_error_code.to_string} to convert to a string representation.
401
402 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
403 WHATWG: Parse error codes *)
404val error_code : parse_error -> Parse_error_code.t
405
406(** Get the line number where the error occurred (1-indexed).
407
408 Line numbers count from 1 and increment at each newline character. *)
409val error_line : parse_error -> int
410
411(** Get the column number where the error occurred (1-indexed).
412
413 Column numbers count from 1 and reset at each newline. *)
414val error_column : parse_error -> int
415
416val pp_parse_error : Format.formatter -> parse_error -> unit
417(** Pretty-print a parse error with location information. *)
418
419(** {1 Error Handling} *)
420
421(** Global error type that wraps all errors raised by the Html5rw library.
422
423 This module provides a unified error type for all parsing and selector
424 errors, along with printers and conversion functions. Use this when you
425 want to handle all possible errors from the library in a uniform way.
426
427 {2 Usage}
428
429 {[
430 (* Converting parse errors *)
431 let errors = Html5rw.errors result in
432 List.iter (fun err ->
433 let unified = Html5rw.Error.of_parse_error err in
434 Printf.eprintf "%s\n" (Html5rw.Error.to_string unified)
435 ) errors
436
437 (* Catching selector errors *)
438 match Html5rw.query result selector with
439 | nodes -> (* success *)
440 | exception Html5rw.Selector.Selector_error code ->
441 let unified = Html5rw.Error.of_selector_error code in
442 Printf.eprintf "%s\n" (Html5rw.Error.to_string unified)
443 ]}
444*)
445module Error : sig
446 (** The unified error type for the Html5rw library. *)
447 type t =
448 | Parse_error of {
449 code : Parse_error_code.t;
450 line : int;
451 column : int;
452 }
453 (** An HTML parse error, including location information.
454
455 Parse errors occur during HTML tokenization and tree construction.
456 The location indicates where in the input the error was detected.
457
458 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
459 WHATWG: Parse errors *)
460
461 | Selector_error of Selector.Error_code.t
462 (** A CSS selector parse error.
463
464 Selector errors occur when parsing malformed CSS selectors passed
465 to {!query} or {!matches}. *)
466
467 val of_parse_error : parse_error -> t
468 (** Convert a parse error to the unified error type.
469
470 {[
471 let errors = Html5rw.errors result in
472 let unified_errors = List.map Html5rw.Error.of_parse_error errors
473 ]} *)
474
475 val of_selector_error : Selector.Error_code.t -> t
476 (** Convert a selector error code to the unified error type.
477
478 {[
479 match Html5rw.query result "invalid[" with
480 | _ -> ()
481 | exception Html5rw.Selector.Selector_error code ->
482 let err = Html5rw.Error.of_selector_error code in
483 Printf.eprintf "%s\n" (Html5rw.Error.to_string err)
484 ]} *)
485
486 val to_string : t -> string
487 (** Convert to a human-readable error message with location information.
488
489 Examples:
490 - ["Parse error at 5:12: unexpected-null-character"]
491 - ["Selector error: Expected \]"] *)
492
493 val pp : Format.formatter -> t -> unit
494 (** Pretty-printer for use with [Format] functions. *)
495
496 val code_string : t -> string
497 (** Get just the error code as a kebab-case string (without location).
498
499 This is useful for programmatic error handling or logging.
500
501 Examples:
502 - ["unexpected-null-character"]
503 - ["expected-closing-bracket"] *)
504end
505
506(** {1 Fragment Parsing} *)
507
508(** Context element for HTML fragment parsing (innerHTML).
509
510 When parsing HTML fragments (like the [innerHTML] of an element), you
511 must specify what element would contain the fragment. This affects how
512 the parser handles certain elements.
513
514 {b Why context matters:}
515
516 HTML parsing rules depend on where content appears. For example:
517 - [<td>] is valid inside [<tr>] but not inside [<div>]
518 - [<li>] is valid inside [<ul>] but creates implied lists elsewhere
519 - Content inside [<table>] has special parsing rules
520
521 {b Example:}
522 {[
523 (* Parse as if content were inside a <ul> *)
524 let ctx = make_fragment_context ~tag_name:"ul" () in
525 let result = parse ~fragment_context:ctx reader
526 (* Now <li> elements are parsed correctly *)
527 ]}
528
529 @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
530 WHATWG: The fragment parsing algorithm *)
531type fragment_context = Parser.fragment_context
532
533(** Create a fragment parsing context.
534
535 The context element determines how the parser interprets the fragment.
536 Choose a context that matches where the fragment would be inserted.
537
538 @param tag_name Tag name of the context element (e.g., ["div"], ["tr"],
539 ["ul"]). This is the element that would contain the fragment.
540 @param namespace Namespace of the context element:
541 - [None] (default): HTML namespace
542 - [Some "svg"]: SVG namespace
543 - [Some "mathml"]: MathML namespace
544
545 {b Examples:}
546 {[
547 (* Parse as innerHTML of a <div> (most common case) *)
548 let ctx = make_fragment_context ~tag_name:"div" ()
549
550 (* Parse as innerHTML of a <ul> - <li> elements work correctly *)
551 let ctx = make_fragment_context ~tag_name:"ul" ()
552
553 (* Parse as innerHTML of an SVG <g> element *)
554 let ctx = make_fragment_context ~tag_name:"g" ~namespace:(Some "svg") ()
555
556 (* Parse as innerHTML of a <table> - table-specific rules apply *)
557 let ctx = make_fragment_context ~tag_name:"table" ()
558 ]}
559
560 @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
561 WHATWG: Fragment parsing algorithm *)
562val make_fragment_context : tag_name:string -> ?namespace:string option ->
563 unit -> fragment_context
564
565(** Get the tag name of a fragment context. *)
566val fragment_context_tag : fragment_context -> string
567
568(** Get the namespace of a fragment context.

 Following the convention documented for {!make_fragment_context}, [None]
 denotes the HTML namespace, while [Some "svg"] and [Some "mathml"] denote
 the SVG and MathML namespaces. *)
569val fragment_context_namespace : fragment_context -> string option
570
571val pp_fragment_context : Format.formatter -> fragment_context -> unit
572(** Pretty-print a fragment context. *)
573
574(** Result of parsing an HTML document.
575
576 This record contains everything produced by parsing:
577 - The DOM tree (accessible via {!val-root})
578 - Any parse errors (accessible via {!val-errors})
579 - The detected encoding (accessible via {!val-encoding})
580*)
581type t = {
582 root : node;
583 (** Root node of the parsed document tree.
584
585 For full document parsing, this is a Document node containing the
586 DOCTYPE (if any) and [<html>] element.
587
588 For fragment parsing, this is a Document Fragment containing the
589 parsed elements. *)
590
591 errors : parse_error list;
592 (** Parse errors encountered during parsing.
593
594 This list is empty unless [~collect_errors:true] was passed to the
595 parse function. Errors are in the order they were encountered.
596
597 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
598 WHATWG: Parse errors *)
599
600 encoding : encoding option;
601 (** Character encoding detected during parsing.
602
603 This is [Some encoding] when using {!parse_bytes} with automatic
604 encoding detection, and [None] when using {!parse} (which expects
605 pre-decoded UTF-8 input). *)
606}
607
608val pp : Format.formatter -> t -> unit
609(** Pretty-print a parse result summary. *)
610
611(** {1 Parsing Functions} *)
612
613(** Parse HTML from a [Bytes.Reader.t].
614
615 This is the primary parsing function. It reads bytes from the provided
616 reader and returns a DOM tree. The input should be valid UTF-8.
617
618 {b Creating readers:}
619 {[
620 open Bytesrw
621
622 (* From a string *)
623 let reader = Bytes.Reader.of_string html_string
624
625 (* From a file *)
626 let ic = open_in "page.html" in
627 let reader = Bytes.Reader.of_in_channel ic
628
629 (* From a buffer *)
630 let reader = Bytes.Reader.of_buffer buf
631 ]}
632
633 {b Parsing a complete document:}
634 {[
635 let result = Html5rw.parse reader
636 let doc = Html5rw.root result
637 ]}
638
639 {b Parsing a fragment:}
640 {[
641 let ctx = Html5rw.make_fragment_context ~tag_name:"div" () in
642 let result = Html5rw.parse ~fragment_context:ctx reader
643 ]}
644
645 @param collect_errors If [true], collect parse errors. Default: [false].
646 Error collection has some performance overhead.
647 @param fragment_context Context element for fragment parsing. If provided,
648 the input is parsed as a fragment (like innerHTML) rather than
649 a complete document.
650
651 @see <https://html.spec.whatwg.org/multipage/parsing.html>
652 WHATWG: HTML parsing algorithm *)
653val parse : ?collect_errors:bool -> ?fragment_context:fragment_context ->
654 Bytesrw.Bytes.Reader.t -> t
655
656(** Parse raw bytes with automatic encoding detection.
657
658 This function is useful when you have raw bytes and don't know the
659 character encoding. It implements the WHATWG encoding sniffing algorithm:
660
661 1. {b BOM detection}: Check for UTF-8, UTF-16LE, or UTF-16BE BOM
662 2. {b Prescan}: Look for [<meta charset="...">] in the first 1024 bytes
663 3. {b Transport hint}: Use the provided [transport_encoding] if any
664 4. {b Fallback}: Use UTF-8 (the modern web default)
665
666 The detected encoding is stored in the result's [encoding] field.
667
668 {b Example:}
669 {[
670 let bytes = Bytes.of_string (really_input_string ic (in_channel_length ic)) in
671 let result = Html5rw.parse_bytes bytes in
672 match Html5rw.encoding result with
673 | Some Utf8 -> print_endline "UTF-8 detected"
674 | Some Windows_1252 -> print_endline "Windows-1252 detected"
675 | _ -> ()
676 ]}
677
678 @param collect_errors If [true], collect parse errors. Default: [false].
679 @param transport_encoding Encoding hint from HTTP Content-Type header.
680 For example, if the server sends [Content-Type: text/html; charset=utf-8],
681 pass [~transport_encoding:"utf-8"].
682 @param fragment_context Context element for fragment parsing.
683
684 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
685 WHATWG: Determining the character encoding *)
686val parse_bytes : ?collect_errors:bool -> ?transport_encoding:string ->
687 ?fragment_context:fragment_context -> bytes -> t
688
689(** {1 Querying} *)
690
691(** Query the DOM tree with a CSS selector.
692
693 CSS selectors are patterns used to select elements in HTML documents.
694 This function returns all nodes matching the selector, in document order.
695
696 {b Supported selectors:}
697
698 {i Type selectors:}
699 - [div], [p], [span] - elements by tag name
700
701 {i Class and ID selectors:}
702 - [#myid] - element with [id="myid"]
703 - [.myclass] - elements with class containing "myclass"
704
705 {i Attribute selectors:}
706 - [[attr]] - elements with the [attr] attribute
707 - [[attr="value"]] - attribute equals value
708 - [[attr~="value"]] - attribute contains word
709 - [[attr|="value"]] - attribute starts with value or value-
710 - [[attr^="value"]] - attribute starts with value
711 - [[attr$="value"]] - attribute ends with value
712 - [[attr*="value"]] - attribute contains value
713
714 {i Pseudo-classes:}
715 - [:first-child], [:last-child] - first/last child of parent
716 - [:nth-child(n)] - nth child (1-indexed)
717 - [:only-child] - only child of parent
718 - [:empty] - elements with no children
719 - [:not(selector)] - elements not matching selector
720
721 {i Combinators:}
722 - [A B] - B descendants of A (any depth)
723 - [A > B] - B direct children of A
724 - [A + B] - B immediately after A (adjacent sibling)
725 - [A ~ B] - B after A (general sibling)
726
727 {i Universal:}
728 - [*] - all elements
729
730 {b Examples:}
731 {[
732 (* All paragraphs *)
733 let ps = query result "p"
734
735 (* Elements with class "warning" inside a div *)
736 let warnings = query result "div .warning"
737
738 (* Direct children of nav that are links *)
739 let nav_links = query result "nav > a"
740
741 (* Complex selector *)
742 let items = query result "ul.menu > li:first-child a[href]"
743 ]}
744
745 @raise Selector.Selector_error if the selector syntax is invalid
746
747 @see <https://www.w3.org/TR/selectors-4/>
748 W3C: Selectors Level 4 *)
749val query : t -> string -> node list
750
751(** Check if a node matches a CSS selector.
752
753 This is useful for filtering nodes or implementing custom traversals.
754
755 {b Example:}
756 {[
757 let is_external_link node =
758 matches node "a[href^='http']"
759 ]}
760
761 @raise Selector.Selector_error if the selector syntax is invalid *)
762val matches : node -> string -> bool
763
764(** {1 Serialization} *)
765
766(** Write the DOM tree to a [Bytes.Writer.t].
767
768 This serializes the DOM back to HTML. The output is valid HTML5 that
769 can be parsed to produce an equivalent DOM tree.
770
771 {b Example:}
772 {[
773 open Bytesrw
774 let buf = Buffer.create 1024 in
775 let writer = Bytes.Writer.of_buffer buf in
776 Html5rw.to_writer result writer;
777 Bytes.Writer.write_eod writer;
778 let html = Buffer.contents buf
779 ]}
780
781 @param pretty If [true] (default), add indentation for readability.
782 If [false], output compact HTML with no added whitespace.
783 @param indent_size Spaces per indentation level (default: 2).
784 Only used when [pretty] is [true].
785
786 @see <https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments>
787 WHATWG: Serialising HTML fragments *)
788val to_writer : ?pretty:bool -> ?indent_size:int -> t ->
789 Bytesrw.Bytes.Writer.t -> unit
790
790(** Serialize the DOM tree to a string.
791
792 Convenience function that serializes to a string instead of a writer.
793 Use {!to_writer} for large documents to avoid holding the entire output in memory.
794
795 @param pretty If [true] (default), add indentation for readability.
796 @param indent_size Spaces per indentation level (default: 2). *)
797val to_string : ?pretty:bool -> ?indent_size:int -> t -> string
799
800(** Extract text content from the DOM tree.
801
802 This concatenates all text nodes in the document, producing a string
803 with just the readable text (no HTML tags).
804
805 {b Example:}
806 {[
807 (* For document: <div><p>Hello</p><p>World</p></div> *)
808 let text = to_text result
809 (* Returns: "Hello World" *)
810 ]}
811
812 @param separator String to insert between text nodes (default: [" "])
813 @param strip If [true] (default), trim leading/trailing whitespace *)
814val to_text : ?separator:string -> ?strip:bool -> t -> string
815
816(** Serialize to html5lib test format.
817
818 This produces the tree format used by the
819 {{:https://github.com/html5lib/html5lib-tests} html5lib-tests} suite.
820 Mainly useful for testing the parser against the reference tests. *)
821val to_test_format : t -> string
822
823(** {1 Result Accessors} *)
824
825(** Get the root node of the parsed document.
826
827 For full document parsing, this returns a Document node. The structure is:
828 {v
829 #document
830 ├── !doctype (if present)
831 └── html
832 ├── head
833 └── body
834 v}
835
836 For fragment parsing, this returns a Document Fragment node containing
837 the parsed elements directly. *)
838val root : t -> node
839
840(** Get parse errors (if error collection was enabled).
841
842 Returns an empty list if [~collect_errors:true] was not passed to the
843 parse function, or if the document was well-formed.
844
845 Errors are returned in the order they were encountered during parsing.
846
847 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
848 WHATWG: Parse errors *)
849val errors : t -> parse_error list
850
851(** Get the detected encoding (if parsed from bytes).
852
853 Returns [Some encoding] when {!parse_bytes} was used, indicating which
854 encoding was detected or specified. Returns [None] when {!parse} was
855 used, since it expects pre-decoded UTF-8 input.
856
857 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
858 WHATWG: Determining the character encoding *)
859val encoding : t -> encoding option
860
861(** {1 DOM Utilities}
862
863 Common DOM operations are available directly on this module. For the
864 full API including more advanced operations, see the {!Dom} module.
865
866 @see <https://html.spec.whatwg.org/multipage/dom.html>
867 WHATWG: Semantics, structure, and APIs of HTML documents
868*)
869
869(** Create an element node.
870
871 Elements are the building blocks of HTML documents. They represent tags
872 like [<div>], [<p>], [<a>], etc.
873
874 @param name Tag name (e.g., ["div"], ["p"], ["span"])
875 @param namespace Element namespace:
876 - [None] (default): HTML namespace
877 - [Some "svg"]: SVG namespace for graphics
878 - [Some "mathml"]: MathML namespace for math notation
879 @param attrs Initial attributes as [(name, value)] pairs
 @param location Optional source location for the new node (see
 {!location}). NOTE(review): presumably this is what {!get_location}
 later returns for the node - confirm against the implementation.
880
881 {b Example:}
882 {[
883 (* Simple element *)
884 let div = create_element "div" ()
885
886 (* Element with attributes *)
887 let link = create_element "a"
888 ~attrs:[("href", "/about"); ("class", "nav-link")]
889 ()
890 ]}
891
892 @see <https://html.spec.whatwg.org/multipage/dom.html#elements-in-the-dom>
893 WHATWG: Elements in the DOM *)
894val create_element : string -> ?namespace:string option ->
895 ?attrs:(string * string) list -> ?location:Dom.location -> unit -> node
897
897(** Create a text node.
898
899 Text nodes contain the readable text content of HTML documents.
900
901 {b Example:}
902 {[
903 let text = create_text "Hello, world!"
904 ]}

 @param location Optional source location for the new node (see
 {!location}). NOTE(review): presumably this is what {!get_location}
 later returns for the node - confirm against the implementation. *)
905val create_text : ?location:Dom.location -> string -> node
907
908(** [create_comment text] creates a comment node holding [text].
909
910 Comments are preserved in the DOM but not rendered. They're written
911 as [<!-- text -->] in HTML.
 @param location Optional {!Dom.location} for the new node; presumably
 its position in the source text (confirm against {!Dom})
912
913 @see <https://html.spec.whatwg.org/multipage/syntax.html#comments>
914 WHATWG: Comments *)
915val create_comment : ?location:Dom.location -> string -> node
916
917(** [create_document ()] creates an empty document node.
918
919 The Document node is the root of an HTML document tree.
920
921 @see <https://html.spec.whatwg.org/multipage/dom.html#document>
922 WHATWG: The Document object *)
923val create_document : unit -> node
924
925(** [create_document_fragment ()] creates a document fragment node.
926
927 Document fragments are lightweight containers for holding nodes
928 without a parent document. They are used for template contents and
929 for fragment parsing results.
930
931 @see <https://dom.spec.whatwg.org/#documentfragment>
932 DOM Standard: DocumentFragment *)
933val create_document_fragment : unit -> node
934
935(** Create a doctype node.
936
937 For HTML5 documents, use [create_doctype ~name:"html" ()].
938
939 @param name DOCTYPE name (usually ["html"])
940 @param public_id Public identifier (legacy)
941 @param system_id System identifier (legacy)
 @param location Optional source location for the new node.
 NOTE(review): typed as [location] here but as {!Dom.location} in the
 other [create_*] functions; presumably the same type - confirm.
942
943 @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
944 WHATWG: The DOCTYPE *)
945val create_doctype : ?name:string -> ?public_id:string ->
946 ?system_id:string -> ?location:location -> unit -> node
947
948(** Append a child node to a parent.
949
950 The child is added as the last child of the parent. If the child
951 already has a parent, it is first removed from that parent.

 NOTE(review): the signature does not say which argument is the parent;
 presumably [append_child parent child], with the parent first as in
 {!insert_before} - confirm against {!Dom}. *)
952val append_child : node -> node -> unit
953
954(** Insert a node before a reference node.
955
956 @param parent The parent node
957 @param new_child The node to insert
958 @param ref_child The existing child to insert before
959
960 @raise Not_found if [ref_child] is not a child of [parent]. *)
961val insert_before : node -> node -> node -> unit
962
963(** Remove a child node from its parent.

 NOTE(review): [parent] and [child] below are not bound to arguments by
 the signature; presumably [remove_child parent child] - confirm
 against {!Dom}.
964
965 @raise Not_found if [child] is not a child of [parent]. *)
966val remove_child : node -> node -> unit
967
968(** [get_attr node name] looks up the value of attribute [name] on [node].
969
970 Returns [Some value] if the attribute exists, [None] otherwise.
971 Names are matched case-sensitively (but were lowercased during parsing).
972
973 @see <https://html.spec.whatwg.org/multipage/dom.html#attributes>
974 WHATWG: Attributes *)
975val get_attr : node -> string -> string option
976
977(** Set an attribute value on a node.
978
979 If the attribute already exists, its value is replaced; otherwise the attribute is added. *)
980val set_attr : node -> string -> string -> unit
981
982(** [has_attr node name] is [true] if [node] has an attribute named [name]. *)
983val has_attr : node -> string -> bool
984
985(** [descendants node] lists all descendant nodes of [node] in document order.
986
987 Returns all nodes below this node in the tree, in the order they
988 appear in the HTML source (depth-first). *)
989val descendants : node -> node list
990
991(** [ancestors node] lists all ancestors of [node], from parent to root.
992
993 Returns the chain of parent nodes, starting with the immediate parent
994 and ending with the root of the tree (the Document node of a document). *)
995val ancestors : node -> node list
996
997(** [get_text_content node] is the text content of [node] and its descendants.
998
999 For text nodes, returns the text directly. For elements, recursively
1000 concatenates all descendant text content. *)
1001val get_text_content : node -> string
1002
1003(** [clone ?deep node] returns a copy of [node].
1004
1005 @param deep If [true], recursively clone all descendants.
1006 If [false] (default), only clone the node itself. *)
1007val clone : ?deep:bool -> node -> node
1008
1009(** {1 Node Predicates}
1010
1011 Functions to test what type of node you have.
1012*)
1013
1014(** [is_element node] is [true] if [node] is an element node.
1015
1016 Elements are HTML tags like [<div>], [<p>], [<a>]. *)
1017val is_element : node -> bool
1018
1019(** [is_text node] is [true] if [node] is a text node.
1020
1021 Text nodes contain character content within elements. *)
1022val is_text : node -> bool
1023
1024(** [is_comment node] is [true] if [node] is a comment node.
1025
1026 Comment nodes represent HTML comments [<!-- ... -->]. *)
1027val is_comment : node -> bool
1028
1029(** [is_document node] is [true] if [node] is a document node.
1030
1031 The document node is the root of a complete HTML document tree. *)
1032val is_document : node -> bool
1033
1034(** [is_document_fragment node] is [true] if [node] is a document fragment.
1035
1036 Document fragments are lightweight containers for nodes. *)
1037val is_document_fragment : node -> bool
1038
1039(** [is_doctype node] is [true] if [node] is a doctype node.
1040
1041 Doctype nodes represent the [<!DOCTYPE>] declaration. *)
1042val is_doctype : node -> bool
1043
1044(** [has_children node] is [true] if [node] has at least one child. *)
1045val has_children : node -> bool