OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** Html5rw - Pure OCaml HTML5 Parser
7
8 This library provides a complete HTML5 parsing solution that implements the
9 {{:https://html.spec.whatwg.org/multipage/parsing.html} WHATWG HTML5
10 parsing specification}. It can parse any HTML document - well-formed or not -
11 and produce a DOM (Document Object Model) tree that matches browser behavior.
12
13 {2 What is HTML?}
14
15 HTML (HyperText Markup Language) is the standard markup language for creating
16 web pages. An HTML document consists of nested {i elements} that describe
17 the structure and content of the page:
18
19 {v
20 <!DOCTYPE html>
21 <html>
22 <head>
23 <title>My Page</title>
24 </head>
25 <body>
26 <h1>Welcome</h1>
27 <p>Hello, <b>world</b>!</p>
28 </body>
29 </html>
30 v}
31
32 Each element is written with a {i start tag} (like [<p>]), content, and an
33 {i end tag} (like [</p>]). Elements can have {i attributes} that provide
34 additional information: [<a href="https://example.com">].
35
36 @see <https://html.spec.whatwg.org/multipage/introduction.html>
37 WHATWG: Introduction to HTML
38
39 {2 The DOM}
40
41 When this parser processes HTML, it doesn't just store the text. Instead,
42 it builds a tree structure called the DOM (Document Object Model). Each
43 element, text fragment, and comment becomes a {i node} in this tree:
44
45 {v
46 Document
47 └── html
48 ├── head
49 │ └── title
50 │ └── #text "My Page"
51 └── body
52 ├── h1
53 │ └── #text "Welcome"
54 └── p
55 ├── #text "Hello, "
56 ├── b
57 │ └── #text "world"
58 └── #text "!"
59 v}
60
61 This tree can be traversed, searched, and modified. The {!Dom} module
62 provides types and functions for working with DOM nodes.
63
64 @see <https://html.spec.whatwg.org/multipage/dom.html>
65 WHATWG: The elements of HTML (DOM chapter)
66
67 {2 Quick Start}
68
69 Parse HTML from a string:
70 {[
71 open Bytesrw
72 let reader = Bytes.Reader.of_string "<p>Hello, world!</p>"
73 let result = Html5rw.parse reader
74 let html = Html5rw.to_string result
75 ]}
76
77 Parse from a file:
78 {[
79 open Bytesrw
80 let result =
81   let ic = open_in "page.html" in
82   let reader = Bytes.Reader.of_in_channel ic in
83   Fun.protect ~finally:(fun () -> close_in ic) (fun () -> Html5rw.parse reader)
84 ]}
85
86 Query with CSS selectors:
87 {[
88 let result = Html5rw.parse reader
89 let divs = Html5rw.query result "div.content"
90 ]}
91
92 {2 Error Handling}
93
94 Unlike many parsers, HTML5 parsing {b never fails}. The WHATWG specification
95 defines error recovery rules for every possible malformed input, ensuring
96 all HTML documents produce a valid DOM tree (just as browsers do).
97
98 For example, parsing [<p>Hello<p>World] produces two paragraphs, not an
99 error, because [<p>] implicitly closes the previous [<p>].
100
101 If you need to detect malformed HTML (e.g., for validation), enable error
102 collection with [~collect_errors:true]. Errors are advisory - the parsing
103 still succeeds.
104
105 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
106 WHATWG: Parse errors
107
108 {2 HTML vs XHTML}
109
110 This parser implements {b HTML5 parsing}, not XHTML parsing. Key differences:
111
112 - Tag and attribute names are case-insensitive ([<DIV>] equals [<div>])
113 - Some end tags are optional ([<p>Hello] is valid)
114 - Void elements have no end tag ([<br>], never [<br></br>]; the slash in [<br/>] is allowed but ignored)
115 - Boolean attributes need no value ([<input disabled>])
116
117 XHTML uses stricter XML rules. If you need XHTML parsing, use an XML parser.
118
119 @see <https://html.spec.whatwg.org/multipage/syntax.html>
120 WHATWG: The HTML syntax
121*)
122
123(** {1 Sub-modules} *)
124
125(** Parse error code types.
126
127 This module provides the {!Parse_error_code.t} variant type that represents
128 all WHATWG-defined parse errors plus tree construction errors.
129
130 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
131 WHATWG: Parse errors *)
132module Parse_error_code = Parse_error_code
133
134(** DOM types and manipulation functions.
135
136 This module provides the core types for representing HTML documents as
137 DOM trees. It includes:
138 - The {!Dom.node} type representing all kinds of DOM nodes
139 - Functions to create, modify, and traverse nodes
140 - Serialization functions to convert DOM back to HTML
141
142 @see <https://html.spec.whatwg.org/multipage/dom.html>
143 WHATWG: The elements of HTML *)
144module Dom = Dom
145
146(** HTML5 tokenizer.
147
148 The tokenizer is the first stage of HTML5 parsing. It converts a stream
149 of characters into a stream of {i tokens}: start tags, end tags, text,
150 comments, and DOCTYPEs.
151
152 Most users don't need to use the tokenizer directly - the {!parse}
153 function handles everything. The tokenizer is exposed for advanced use
154 cases like syntax highlighting or partial parsing.
155
156 @see <https://html.spec.whatwg.org/multipage/parsing.html#tokenization>
157 WHATWG: Tokenization *)
158module Tokenizer = Tokenizer
159
160(** Encoding detection and decoding.
161
162 HTML documents can use various character encodings (UTF-8, ISO-8859-1,
163 etc.). This module implements the WHATWG encoding sniffing algorithm
164 that browsers use to detect the encoding of a document:
165
166 + Check for a BOM (Byte Order Mark)
167 + Look for a [<meta charset>] declaration
168 + Use HTTP Content-Type header hint (if available)
169 + Fall back to UTF-8
170
171 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
172 WHATWG: Determining the character encoding
173 @see <https://encoding.spec.whatwg.org/>
174 WHATWG Encoding Standard *)
175module Encoding = Encoding
176
177(** CSS selector engine.
178
179 This module provides CSS selector support for querying the DOM tree.
180 CSS selectors are patterns used to select HTML elements based on their
181 tag names, attributes, classes, IDs, and position in the document.
182
183 Example selectors:
184 - [div] - all [<div>] elements
185 - [#header] - element with [id="header"]
186 - [.warning] - elements with [class="warning"]
187 - [div > p] - [<p>] elements that are direct children of [<div>]
188 - [[href]] - elements with an [href] attribute
189
190 @see <https://www.w3.org/TR/selectors-4/>
191 W3C Selectors Level 4 specification *)
192module Selector = Selector
193
194(** HTML entity decoding.
195
196 HTML uses {i character references} to represent characters that are
197 hard to type or have special meaning:
198
199 - Named references: [&amp;] (ampersand), [&lt;] (less than), [&nbsp;] (non-breaking space)
200 - Decimal references: [&#60;] (less than as decimal 60)
201 - Hexadecimal references: [&#x3C;] (less than as hex 3C)
202
203 This module decodes all 2,231 named character references defined in
204 the WHATWG specification, plus numeric references.
205
206 @see <https://html.spec.whatwg.org/multipage/named-characters.html>
207 WHATWG: Named character references *)
208module Entities = Entities
209
210(** Low-level parser access.
211
212 This module exposes the internals of the HTML5 parser for advanced use.
213 Most users should use the top-level {!parse} function instead.
214
215 The parser exposes:
216 - Insertion modes for the tree construction algorithm
217 - The tree builder state machine
218 - Lower-level parsing functions
219
220 @see <https://html.spec.whatwg.org/multipage/parsing.html#tree-construction>
221 WHATWG: Tree construction *)
222module Parser = Parser
223
224(** {1 Core Types} *)
225
226(** DOM node type.
227
228 A node represents one part of an HTML document. Nodes form a tree
229 structure with parent/child relationships. There are several kinds:
230
231 - {b Element nodes}: HTML tags like [<div>], [<p>], [<a>]
232 - {b Text nodes}: Text content within elements
233 - {b Comment nodes}: HTML comments [<!-- ... -->]
234 - {b Document nodes}: The root of a document tree
235 - {b Document fragment nodes}: Lightweight containers
236 - {b Doctype nodes}: The [<!DOCTYPE html>] declaration
237
238 See {!Dom} for manipulation functions.
239
240 @see <https://html.spec.whatwg.org/multipage/dom.html>
241 WHATWG: The DOM *)
242type node = Dom.node
243
244val pp_node : Format.formatter -> node -> unit
245(** Pretty-print a DOM node. Prints a summary representation showing the
246 node type and key attributes. Does not recursively print children. *)
247
248(** DOCTYPE information.
249
250 The DOCTYPE declaration ([<!DOCTYPE html>]) appears at the start of HTML
251 documents. It tells browsers to use standards mode for rendering.
252
253 In HTML5, the DOCTYPE is minimal - just [<!DOCTYPE html>] with no public
254 or system identifiers. Legacy DOCTYPEs may have additional fields.
255
256 @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
257 WHATWG: The DOCTYPE *)
258type doctype_data = Dom.doctype_data = {
259 name : string option;
260 (** DOCTYPE name, typically ["html"] *)
261
262 public_id : string option;
263 (** Public identifier for legacy DOCTYPEs (e.g., XHTML, HTML4) *)
264
265 system_id : string option;
266 (** System identifier (URL) for legacy DOCTYPEs *)
267}
268
269val pp_doctype_data : Format.formatter -> doctype_data -> unit
270(** Pretty-print DOCTYPE data. *)
271
272(** Quirks mode as determined during parsing.
273
274 {i Quirks mode} controls how browsers render CSS and compute layouts.
275 It exists for backwards compatibility with old web pages that relied
276 on browser bugs.
277
278 - {b No_quirks}: Standards mode. The document is rendered according to
279 modern HTML5 and CSS specifications. Triggered by [<!DOCTYPE html>].
280
281 - {b Quirks}: Full quirks mode. The browser emulates bugs from older
282 browsers (primarily IE5). Triggered by missing or malformed DOCTYPEs.
283 Affects CSS box model, table layout, font inheritance, and more.
284
285 - {b Limited_quirks}: Almost standards mode. Only a few specific quirks
286 are applied, mainly affecting table cell vertical alignment.
287
288 {b Recommendation:} Always use [<!DOCTYPE html>] to ensure standards mode.
289
290 @see <https://quirks.spec.whatwg.org/>
291 Quirks Mode Standard
292 @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
293 WHATWG: How quirks mode is determined *)
294type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks
295
296val pp_quirks_mode : Format.formatter -> quirks_mode -> unit
297(** Pretty-print quirks mode. *)
298
299(** Character encoding detected or specified.
300
301 HTML documents are sequences of bytes that must be decoded into characters.
302 Different encodings interpret the same bytes differently. For example:
303
304 - UTF-8: The modern standard, supporting all Unicode characters
305 - Windows-1252: Common on older Western European web pages
306 - ISO-8859-2: Used for Central European languages
307 - UTF-16: Used by some Windows applications
308
309 The parser detects encoding automatically when using {!parse_bytes}.
310 The detected encoding is available via {!val-encoding}.
311
312 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
313 WHATWG: Determining the character encoding
314 @see <https://encoding.spec.whatwg.org/>
315 WHATWG Encoding Standard *)
316type encoding = Encoding.encoding =
317 | Utf8
318 (** UTF-8: The dominant encoding for the web, supporting all Unicode *)
319
320 | Utf16le
321 (** UTF-16 Little-Endian: 16-bit encoding, used by Windows *)
322
323 | Utf16be
324 (** UTF-16 Big-Endian: 16-bit encoding, network byte order *)
325
326 | Windows_1252
327 (** Windows-1252 (CP-1252): Western European, superset of ISO-8859-1 *)
328
329 | Iso_8859_2
330 (** ISO-8859-2: Central European (Polish, Czech, Hungarian, etc.) *)
331
332 | Euc_jp
333 (** EUC-JP: Extended Unix Code for Japanese *)
334
335val pp_encoding : Format.formatter -> encoding -> unit
336(** Pretty-print an encoding using its canonical label. *)
337
338(** A parse error encountered during HTML5 parsing.
339
340 HTML5 parsing {b never fails} - the specification defines error recovery
341 for all malformed input. However, conformance checkers can report these
342 errors. Enable error collection with [~collect_errors:true] if you want
343 to detect malformed HTML.
344
345 {b Common parse errors:}
346
347 - ["unexpected-null-character"]: Null byte in the input
348 - ["eof-before-tag-name"]: File ended while reading a tag
349 - ["unexpected-character-in-attribute-name"]: Invalid attribute syntax
350 - ["missing-doctype"]: Document started without [<!DOCTYPE>]
351 - ["duplicate-attribute"]: Same attribute appears twice on an element
352
353 The full list of parse error codes is defined in the WHATWG specification.
354
355 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
356 WHATWG: Complete list of parse errors *)
357type parse_error = Parser.parse_error
358
359(** Get the error code.
360
361 Returns the {!Parse_error_code.t} variant representing this error.
362 This allows pattern matching on specific error types:
363
364 {[
365 match Html5rw.error_code err with
366 | Parse_error_code.Unexpected_null_character -> (* handle *)
367 | Parse_error_code.Eof_in_tag -> (* handle *)
368 | Parse_error_code.Tree_construction_error msg -> (* handle tree error *)
369 | _ -> (* other *)
370 ]}
371
372 Use {!Parse_error_code.to_string} to convert to a string representation.
373
374 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
375 WHATWG: Parse error codes *)
376val error_code : parse_error -> Parse_error_code.t
377
378(** Get the line number where the error occurred (1-indexed).
379
380 Line numbers count from 1 and increment at each newline character. *)
381val error_line : parse_error -> int
382
383(** Get the column number where the error occurred (1-indexed).
384
385 Column numbers count from 1 and reset at each newline. *)
386val error_column : parse_error -> int
387
388val pp_parse_error : Format.formatter -> parse_error -> unit
389(** Pretty-print a parse error with location information. *)
390
391(** {1 Error Handling} *)
392
393(** Global error type that wraps all errors raised by the Html5rw library.
394
395 This module provides a unified error type for all parsing and selector
396 errors, along with printers and conversion functions. Use this when you
397 want to handle all possible errors from the library in a uniform way.
398
399 {2 Usage}
400
401 {[
402 (* Converting parse errors *)
403 let errors = Html5rw.errors result in
404 List.iter (fun err ->
405 let unified = Html5rw.Error.of_parse_error err in
406 Printf.eprintf "%s\n" (Html5rw.Error.to_string unified)
407 ) errors
408
409 (* Catching selector errors *)
410 match Html5rw.query result selector with
411 | nodes -> (* success *)
412 | exception Html5rw.Selector.Selector_error code ->
413 let unified = Html5rw.Error.of_selector_error code in
414 Printf.eprintf "%s\n" (Html5rw.Error.to_string unified)
415 ]}
416*)
417module Error : sig
418 (** The unified error type for the Html5rw library. *)
419 type t =
420 | Parse_error of {
421 code : Parse_error_code.t;
422 line : int;
423 column : int;
424 }
425 (** An HTML parse error, including location information.
426
427 Parse errors occur during HTML tokenization and tree construction.
428 The location indicates where in the input the error was detected.
429
430 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
431 WHATWG: Parse errors *)
432
433 | Selector_error of Selector.Error_code.t
434 (** A CSS selector parse error.
435
436 Selector errors occur when parsing malformed CSS selectors passed
437 to {!query} or {!matches}. *)
438
439 val of_parse_error : parse_error -> t
440 (** Convert a parse error to the unified error type.
441
442 {[
443 let errors = Html5rw.errors result in
444 let unified_errors = List.map Html5rw.Error.of_parse_error errors
445 ]} *)
446
447 val of_selector_error : Selector.Error_code.t -> t
448 (** Convert a selector error code to the unified error type.
449
450 {[
451 match Html5rw.query result "invalid[" with
452 | _ -> ()
453 | exception Html5rw.Selector.Selector_error code ->
454 let err = Html5rw.Error.of_selector_error code in
455 Printf.eprintf "%s\n" (Html5rw.Error.to_string err)
456 ]} *)
457
458 val to_string : t -> string
459 (** Convert to a human-readable error message with location information.
460
461 Examples:
462 - ["Parse error at 5:12: unexpected-null-character"]
463 - ["Selector error: Expected \]"] *)
464
465 val pp : Format.formatter -> t -> unit
466 (** Pretty-printer for use with [Format] functions. *)
467
468 val code_string : t -> string
469 (** Get just the error code as a kebab-case string (without location).
470
471 This is useful for programmatic error handling or logging.
472
473 Examples:
474 - ["unexpected-null-character"]
475 - ["expected-closing-bracket"] *)
476end
477
478(** {1 Fragment Parsing} *)
479
480(** Context element for HTML fragment parsing (innerHTML).
481
482 When parsing HTML fragments (like the [innerHTML] of an element), you
483 must specify what element would contain the fragment. This affects how
484 the parser handles certain elements.
485
486 {b Why context matters:}
487
488 HTML parsing rules depend on where content appears. For example:
489 - [<td>] is valid inside [<tr>] but not inside [<div>]
490 - [<li>] is valid inside [<ul>] but creates implied lists elsewhere
491 - Content inside [<table>] has special parsing rules
492
493 {b Example:}
494 {[
495 (* Parse as if content were inside a <ul> *)
496 let ctx = make_fragment_context ~tag_name:"ul" () in
497 let result = parse ~fragment_context:ctx reader
498 (* Now <li> elements are parsed correctly *)
499 ]}
500
501 @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
502 WHATWG: The fragment parsing algorithm *)
503type fragment_context = Parser.fragment_context
504
505(** Create a fragment parsing context.
506
507 The context element determines how the parser interprets the fragment.
508 Choose a context that matches where the fragment would be inserted.
509
510 @param tag_name Tag name of the context element (e.g., ["div"], ["tr"],
511 ["ul"]). This is the element that would contain the fragment.
512 @param namespace Namespace of the context element:
513 - [None] (default): HTML namespace
514 - [Some "svg"]: SVG namespace
515 - [Some "mathml"]: MathML namespace
516
517 {b Examples:}
518 {[
519 (* Parse as innerHTML of a <div> (most common case) *)
520 let ctx = make_fragment_context ~tag_name:"div" ()
521
522 (* Parse as innerHTML of a <ul> - <li> elements work correctly *)
523 let ctx = make_fragment_context ~tag_name:"ul" ()
524
525 (* Parse as innerHTML of an SVG <g> element *)
526 let ctx = make_fragment_context ~tag_name:"g" ~namespace:(Some "svg") ()
527
528 (* Parse as innerHTML of a <table> - table-specific rules apply *)
529 let ctx = make_fragment_context ~tag_name:"table" ()
530 ]}
531
532 @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
533 WHATWG: Fragment parsing algorithm *)
534val make_fragment_context : tag_name:string -> ?namespace:string option ->
535 unit -> fragment_context
536
537(** Get the tag name of a fragment context. *)
538val fragment_context_tag : fragment_context -> string
539
540(** Get the namespace of a fragment context. *)
541val fragment_context_namespace : fragment_context -> string option
542
543val pp_fragment_context : Format.formatter -> fragment_context -> unit
544(** Pretty-print a fragment context. *)
545
546(** Result of parsing an HTML document.
547
548 This record contains everything produced by parsing:
549 - The DOM tree (accessible via {!val-root})
550 - Any parse errors (accessible via {!val-errors})
551 - The detected encoding (accessible via {!val-encoding})
552*)
553type t = {
554 root : node;
555 (** Root node of the parsed document tree.
556
557 For full document parsing, this is a Document node containing the
558 DOCTYPE (if any) and [<html>] element.
559
560 For fragment parsing, this is a Document Fragment containing the
561 parsed elements. *)
562
563 errors : parse_error list;
564 (** Parse errors encountered during parsing.
565
566 This list is empty unless [~collect_errors:true] was passed to the
567 parse function. Errors are in the order they were encountered.
568
569 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
570 WHATWG: Parse errors *)
571
572 encoding : encoding option;
573 (** Character encoding detected during parsing.
574
575 This is [Some encoding] when using {!parse_bytes} with automatic
576 encoding detection, and [None] when using {!parse} (which expects
577 pre-decoded UTF-8 input). *)
578}
579
580val pp : Format.formatter -> t -> unit
581(** Pretty-print a parse result summary. *)
582
583(** {1 Parsing Functions} *)
584
585(** Parse HTML from a [Bytes.Reader.t].
586
587 This is the primary parsing function. It reads bytes from the provided
588 reader and returns a DOM tree. The input should be valid UTF-8.
589
590 {b Creating readers:}
591 {[
592 open Bytesrw
593
594 (* From a string *)
595 let reader = Bytes.Reader.of_string html_string
596
597 (* From a file *)
598 let ic = open_in "page.html" in
599 let reader = Bytes.Reader.of_in_channel ic
600
601 (* From a buffer *)
602 let reader = Bytes.Reader.of_buffer buf
603 ]}
604
605 {b Parsing a complete document:}
606 {[
607 let result = Html5rw.parse reader
608 let doc = Html5rw.root result
609 ]}
610
611 {b Parsing a fragment:}
612 {[
613 let ctx = Html5rw.make_fragment_context ~tag_name:"div" () in
614 let result = Html5rw.parse ~fragment_context:ctx reader
615 ]}
616
617 @param collect_errors If [true], collect parse errors. Default: [false].
618 Error collection has some performance overhead.
619 @param fragment_context Context element for fragment parsing. If provided,
620 the input is parsed as a fragment (like innerHTML) rather than
621 a complete document.
622
623 @see <https://html.spec.whatwg.org/multipage/parsing.html>
624 WHATWG: HTML parsing algorithm *)
625val parse : ?collect_errors:bool -> ?fragment_context:fragment_context ->
626 Bytesrw.Bytes.Reader.t -> t
627
628(** Parse raw bytes with automatic encoding detection.
629
630 This function is useful when you have raw bytes and don't know the
631 character encoding. It implements the WHATWG encoding sniffing algorithm:
632
633 + {b BOM detection}: Check for UTF-8, UTF-16LE, or UTF-16BE BOM
634 + {b Prescan}: Look for [<meta charset="...">] in the first 1024 bytes
635 + {b Transport hint}: Use the provided [transport_encoding] if any
636 + {b Fallback}: Use UTF-8 (the modern web default)
637
638 The detected encoding is stored in the result's [encoding] field.
639
640 {b Example:}
641 {[
642 let bytes = really_input_bytes ic (in_channel_length ic) in
643 let result = Html5rw.parse_bytes bytes in
644 match Html5rw.encoding result with
645 | Some Html5rw.Utf8 -> print_endline "UTF-8 detected"
646 | Some Html5rw.Windows_1252 -> print_endline "Windows-1252 detected"
647 | _ -> ()
648 ]}
649
650 @param collect_errors If [true], collect parse errors. Default: [false].
651 @param transport_encoding Encoding hint from HTTP Content-Type header.
652 For example, if the server sends [Content-Type: text/html; charset=utf-8],
653 pass [~transport_encoding:"utf-8"].
654 @param fragment_context Context element for fragment parsing.
655
656 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
657 WHATWG: Determining the character encoding *)
658val parse_bytes : ?collect_errors:bool -> ?transport_encoding:string ->
659 ?fragment_context:fragment_context -> bytes -> t
660
661(** {1 Querying} *)
662
663(** Query the DOM tree with a CSS selector.
664
665 CSS selectors are patterns used to select elements in HTML documents.
666 This function returns all nodes matching the selector, in document order.
667
668 {b Supported selectors:}
669
670 {i Type selectors:}
671 - [div], [p], [span] - elements by tag name
672
673 {i Class and ID selectors:}
674 - [#myid] - element with [id="myid"]
675 - [.myclass] - elements with class containing "myclass"
676
677 {i Attribute selectors:}
678 - [[attr]] - elements with the [attr] attribute
679 - [[attr="value"]] - attribute equals value
680 - [[attr~="value"]] - attribute contains word
681 - [[attr|="value"]] - attribute starts with value or value-
682 - [[attr^="value"]] - attribute starts with value
683 - [[attr$="value"]] - attribute ends with value
684 - [[attr*="value"]] - attribute contains value
685
686 {i Pseudo-classes:}
687 - [:first-child], [:last-child] - first/last child of parent
688 - [:nth-child(n)] - nth child (1-indexed)
689 - [:only-child] - only child of parent
690 - [:empty] - elements with no children
691 - [:not(selector)] - elements not matching selector
692
693 {i Combinators:}
694 - [A B] - B descendants of A (any depth)
695 - [A > B] - B direct children of A
696 - [A + B] - B immediately after A (adjacent sibling)
697 - [A ~ B] - B after A (general sibling)
698
699 {i Universal:}
700 - [*] - all elements
701
702 {b Examples:}
703 {[
704 (* All paragraphs *)
705 let ps = query result "p"
706
707 (* Elements with class "warning" inside a div *)
708 let warnings = query result "div .warning"
709
710 (* Direct children of nav that are links *)
711 let nav_links = query result "nav > a"
712
713 (* Complex selector *)
714 let items = query result "ul.menu > li:first-child a[href]"
715 ]}
716
717 @raise Selector.Selector_error if the selector syntax is invalid
718
719 @see <https://www.w3.org/TR/selectors-4/>
720 W3C: Selectors Level 4 *)
721val query : t -> string -> node list
722
723(** Check if a node matches a CSS selector.
724
725 This is useful for filtering nodes or implementing custom traversals.
726
727 {b Example:}
728 {[
729 let is_external_link node =
730 matches node "a[href^='http']"
731 ]}
732
733 @raise Selector.Selector_error if the selector syntax is invalid *)
734val matches : node -> string -> bool
735
736(** {1 Serialization} *)
737
738(** Write the DOM tree to a [Bytes.Writer.t].
739
740 This serializes the DOM back to HTML. The output is valid HTML5 that
741 can be parsed to produce an equivalent DOM tree.
742
743 {b Example:}
744 {[
745 open Bytesrw
746 let buf = Buffer.create 1024 in
747 let writer = Bytes.Writer.of_buffer buf in
748 Html5rw.to_writer result writer;
749 Bytes.Writer.write_eod writer;
750 let html = Buffer.contents buf
751 ]}
752
753 @param pretty If [true] (default), add indentation for readability.
754 If [false], output compact HTML with no added whitespace.
755 @param indent_size Spaces per indentation level (default: 2).
756 Only used when [pretty] is [true].
757
758 @see <https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments>
759 WHATWG: Serialising HTML fragments *)
760val to_writer : ?pretty:bool -> ?indent_size:int -> t ->
761 Bytesrw.Bytes.Writer.t -> unit
762
763(** Serialize the DOM tree to a string.
764
765 Convenience function that serializes to a string instead of a writer.
766 Use {!to_writer} for large documents to avoid building one large string.
767
768 @param pretty If [true] (default), add indentation for readability.
769 @param indent_size Spaces per indentation level (default: 2). *)
770val to_string : ?pretty:bool -> ?indent_size:int -> t -> string
771
772(** Extract text content from the DOM tree.
773
774 This concatenates all text nodes in the document, producing a string
775 with just the readable text (no HTML tags).
776
777 {b Example:}
778 {[
779 (* For document: <div><p>Hello</p><p>World</p></div> *)
780 let text = to_text result
781 (* Returns: "Hello World" *)
782 ]}
783
784 @param separator String to insert between text nodes (default: [" "])
785 @param strip If [true] (default), trim leading/trailing whitespace *)
786val to_text : ?separator:string -> ?strip:bool -> t -> string
787
788(** Serialize to html5lib test format.
789
790 This produces the tree format used by the
791 {{:https://github.com/html5lib/html5lib-tests} html5lib-tests} suite.
792 Mainly useful for testing the parser against the reference tests. *)
793val to_test_format : t -> string
794
795(** {1 Result Accessors} *)
796
797(** Get the root node of the parsed document.
798
799 For full document parsing, this returns a Document node. The structure is:
800 {v
801 #document
802 ├── !doctype (if present)
803 └── html
804 ├── head
805 └── body
806 v}
807
808 For fragment parsing, this returns a Document Fragment node containing
809 the parsed elements directly. *)
810val root : t -> node
811
812(** Get parse errors (if error collection was enabled).
813
814 Returns an empty list if [~collect_errors:true] was not passed to the
815 parse function, or if the document was well-formed.
816
817 Errors are returned in the order they were encountered during parsing.
818
819 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
820 WHATWG: Parse errors *)
821val errors : t -> parse_error list
822
823(** Get the detected encoding (if parsed from bytes).
824
825 Returns [Some encoding] when {!parse_bytes} was used, indicating which
826 encoding was detected or specified. Returns [None] when {!parse} was
827 used, since it expects pre-decoded UTF-8 input.
828
829 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
830 WHATWG: Determining the character encoding *)
831val encoding : t -> encoding option
832
833(** {1 DOM Utilities}
834
835 Common DOM operations are available directly on this module. For the
836 full API including more advanced operations, see the {!Dom} module.
837
838 @see <https://html.spec.whatwg.org/multipage/dom.html>
839 WHATWG: The elements of HTML
840*)
841
842(** Create an element node.
843
844 Elements are the building blocks of HTML documents. They represent tags
845 like [<div>], [<p>], [<a>], etc.
846
847 @param name Tag name (e.g., ["div"], ["p"], ["span"])
848 @param namespace Element namespace:
849 - [None] (default): HTML namespace
850 - [Some "svg"]: SVG namespace for graphics
851 - [Some "mathml"]: MathML namespace for math notation
852 @param attrs Initial attributes as [(name, value)] pairs
853
854 {b Example:}
855 {[
856 (* Simple element *)
857 let div = create_element "div" ()
858
859 (* Element with attributes *)
860 let link = create_element "a"
861 ~attrs:[("href", "/about"); ("class", "nav-link")]
862 ()
863 ]}
864
865 @see <https://html.spec.whatwg.org/multipage/dom.html#elements-in-the-dom>
866 WHATWG: Elements in the DOM *)
867val create_element : string -> ?namespace:string option ->
868 ?attrs:(string * string) list -> unit -> node
869
870(** Create a text node.
871
872 Text nodes contain the readable text content of HTML documents.
873
874 {b Example:}
875 {[
876 let text = create_text "Hello, world!"
877 ]} *)
878val create_text : string -> node
879
880(** Create a comment node.
881
882 Comments are preserved in the DOM but not rendered. They're written
883 as [<!-- text -->] in HTML.
884
885 @see <https://html.spec.whatwg.org/multipage/syntax.html#comments>
886 WHATWG: Comments *)
887val create_comment : string -> node
888
889(** Create an empty document node.
890
891 The Document node is the root of an HTML document tree.
892
893 @see <https://html.spec.whatwg.org/multipage/dom.html#document>
894 WHATWG: The Document object *)
895val create_document : unit -> node
896
897(** Create a document fragment node.
898
899 Document fragments are lightweight containers for holding nodes
900 without a parent document. Used for template contents and fragment
901 parsing results.
902
903 @see <https://dom.spec.whatwg.org/#documentfragment>
904 DOM Standard: DocumentFragment *)
905val create_document_fragment : unit -> node
906
907(** Create a doctype node.
908
909 For HTML5 documents, use [create_doctype ~name:"html" ()].
910
911 @param name DOCTYPE name (usually ["html"])
912 @param public_id Public identifier (legacy)
913 @param system_id System identifier (legacy)
914
915 @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
916 WHATWG: The DOCTYPE *)
917val create_doctype : ?name:string -> ?public_id:string ->
918 ?system_id:string -> unit -> node
919
920(** Append a child node to a parent.
921
922 The child is added as the last child of the parent. If the child
923 already has a parent, it is first removed from that parent. *)
924val append_child : node -> node -> unit
925
926(** Insert a node before a reference node.
927
928 @param parent The parent node
929 @param new_child The node to insert
930 @param ref_child The existing child to insert before
931
932 @raise Not_found if [ref_child] is not a child of [parent]. *)
933val insert_before : node -> node -> node -> unit
934
935(** Remove a child node from its parent.
936
937 @raise Not_found if [child] is not a child of [parent]. *)
938val remove_child : node -> node -> unit
939
940(** Get an attribute value.
941
942 Returns [Some value] if the attribute exists, [None] otherwise.
943 Attribute names are case-sensitive (but were lowercased during parsing).
944
945 @see <https://html.spec.whatwg.org/multipage/dom.html#attributes>
946 WHATWG: Attributes *)
947val get_attr : node -> string -> string option
948
949(** Set an attribute value.
950
951 If the attribute exists, it is replaced. If not, it is added. *)
952val set_attr : node -> string -> string -> unit
953
954(** Check if a node has an attribute. *)
955val has_attr : node -> string -> bool
956
957(** Get all descendant nodes in document order.
958
959 Returns all nodes below this node in the tree, in the order they
960 appear in the HTML source (depth-first). *)
961val descendants : node -> node list
962
963(** Get all ancestor nodes from parent to root.
964
965 Returns the chain of parent nodes, starting with the immediate parent
966 and ending with the Document node. *)
967val ancestors : node -> node list
968
969(** Get text content of a node and its descendants.
970
971 For text nodes, returns the text directly. For elements, recursively
972 concatenates all descendant text content. *)
973val get_text_content : node -> string
974
975(** Clone a node.
976
977 @param deep If [true], recursively clone all descendants.
978 If [false] (default), only clone the node itself. *)
979val clone : ?deep:bool -> node -> node
980
981(** {1 Node Predicates}
982
983 Functions to test what type of node you have.
984*)
985
986(** Test if a node is an element.
987
988 Elements are HTML tags like [<div>], [<p>], [<a>]. *)
989val is_element : node -> bool
990
991(** Test if a node is a text node.
992
993 Text nodes contain character content within elements. *)
994val is_text : node -> bool
995
996(** Test if a node is a comment node.
997
998 Comment nodes represent HTML comments [<!-- ... -->]. *)
999val is_comment : node -> bool
1000
1001(** Test if a node is a document node.
1002
1003 The document node is the root of a complete HTML document tree. *)
1004val is_document : node -> bool
1005
1006(** Test if a node is a document fragment.
1007
1008 Document fragments are lightweight containers for nodes. *)
1009val is_document_fragment : node -> bool
1010
1011(** Test if a node is a doctype node.
1012
1013 Doctype nodes represent the [<!DOCTYPE>] declaration. *)
1014val is_doctype : node -> bool
1015
1016(** Test if a node has children. *)
1017val has_children : node -> bool