lib/html5rw/html5rw.mli at main · anil.recoil.org/ocaml-html5rw

OCaml HTML5 parser/serialiser based on Python's JustHTML
ocaml-html5rw / lib / html5rw / html5rw.mli
at main 36 kB view raw
   1(*---------------------------------------------------------------------------
   2  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   3  SPDX-License-Identifier: MIT
   4 ---------------------------------------------------------------------------*)
   5
   6(** Html5rw - Pure OCaml HTML5 Parser
   7
   8    This library provides a complete HTML5 parsing solution that implements the
   9    {{:https://html.spec.whatwg.org/multipage/parsing.html} WHATWG HTML5
  10    parsing specification}. It can parse any HTML document - well-formed or not -
  11    and produce a DOM (Document Object Model) tree that matches browser behavior.
  12
  13    {2 What is HTML?}
  14
  15    HTML (HyperText Markup Language) is the standard markup language for creating
  16    web pages. An HTML document consists of nested {i elements} that describe
  17    the structure and content of the page:
  18
  19    {v
  20    <!DOCTYPE html>
  21    <html>
  22      <head>
  23        <title>My Page</title>
  24      </head>
  25      <body>
  26        <h1>Welcome</h1>
  27        <p>Hello, <b>world</b>!</p>
  28      </body>
  29    </html>
  30    v}
  31
  32    Each element is written with a {i start tag} (like [<p>]), content, and an
  33    {i end tag} (like [</p>]). Elements can have {i attributes} that provide
  34    additional information: [<a href="https://example.com">].
  35
  36    @see <https://html.spec.whatwg.org/multipage/introduction.html>
  37         WHATWG: Introduction to HTML
  38
  39    {2 The DOM}
  40
  41    When this parser processes HTML, it doesn't just store the text. Instead,
  42    it builds a tree structure called the DOM (Document Object Model). Each
  43    element, text fragment, and comment becomes a {i node} in this tree:
  44
  45    {v
  46    Document
  47    └── html
  48        ├── head
  49        │   └── title
  50        │       └── #text "My Page"
  51        └── body
  52            ├── h1
  53            │   └── #text "Welcome"
  54            └── p
  55                ├── #text "Hello, "
  56                ├── b
  57                │   └── #text "world"
  58                └── #text "!"
  59    v}
  60
  61    This tree can be traversed, searched, and modified. The {!Dom} module
  62    provides types and functions for working with DOM nodes.
  63
  64    @see <https://html.spec.whatwg.org/multipage/dom.html>
  65         WHATWG: The elements of HTML (DOM chapter)
  66
  67    {2 Quick Start}
  68
  69    Parse HTML from a string:
  70    {[
  71      open Bytesrw
  72      let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
  73      let result = Html5rw.parse reader in
  74      let html = Html5rw.to_string result
  75    ]}
  76
  77    Parse from a file:
  78    {[
  79      open Bytesrw
  80      let ic = open_in "page.html" in
  81      let reader = Bytes.Reader.of_in_channel ic in
  82      let result = Html5rw.parse reader in
  83      close_in ic
  84    ]}
  85
  86    Query with CSS selectors:
  87    {[
  88      let result = Html5rw.parse reader in
  89      let divs = Html5rw.query result "div.content"
  90    ]}
  91
  92    {2 Error Handling}
  93
  94    Unlike many parsers, HTML5 parsing {b never fails}. The WHATWG specification
  95    defines error recovery rules for every possible malformed input, ensuring
  96    all HTML documents produce a valid DOM tree (just as browsers do).
  97
  98    For example, parsing [<p>Hello<p>World] produces two paragraphs, not an
  99    error, because [<p>] implicitly closes the previous [<p>].
 100
 101    If you need to detect malformed HTML (e.g., for validation), enable error
 102    collection with [~collect_errors:true]. Errors are advisory - the parsing
 103    still succeeds.
 104
 105    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 106         WHATWG: Parse errors
 107
 108    {2 HTML vs XHTML}
 109
 110    This parser implements {b HTML5 parsing}, not XHTML parsing. Key differences:
 111
 112    - Tag and attribute names are case-insensitive ([<DIV>] equals [<div>])
 113    - Some end tags are optional ([<p>Hello] is valid)
 114    - Void elements have no end tag ([<br>], not [<br/>] or [<br></br>])
 115    - Boolean attributes need no value ([<input disabled>])
 116
 117    XHTML uses stricter XML rules. If you need XHTML parsing, use an XML parser.
 118
 119    @see <https://html.spec.whatwg.org/multipage/syntax.html>
 120         WHATWG: The HTML syntax
 121*)
 122
 123(** {1 Sub-modules} *)
 124
 125(** Parse error code types.
 126
 127    This module provides the {!Parse_error_code.t} variant type that represents
 128    all WHATWG-defined parse errors plus tree construction errors.
 129
 130    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 131         WHATWG: Parse errors *)
 132module Parse_error_code = Parse_error_code
 133
 134(** DOM types and manipulation functions.
 135
 136    This module provides the core types for representing HTML documents as
 137    DOM trees. It includes:
 138    - The {!Dom.node} type representing all kinds of DOM nodes
 139    - Functions to create, modify, and traverse nodes
 140    - Serialization functions to convert DOM back to HTML
 141
 142    @see <https://html.spec.whatwg.org/multipage/dom.html>
 143         WHATWG: The elements of HTML *)
 144module Dom = Dom
 145
 146(** HTML5 tokenizer.
 147
 148    The tokenizer is the first stage of HTML5 parsing. It converts a stream
 149    of characters into a stream of {i tokens}: start tags, end tags, text,
 150    comments, and DOCTYPEs.
 151
 152    Most users don't need to use the tokenizer directly - the {!parse}
 153    function handles everything. The tokenizer is exposed for advanced use
 154    cases like syntax highlighting or partial parsing.
 155
 156    @see <https://html.spec.whatwg.org/multipage/parsing.html#tokenization>
 157         WHATWG: Tokenization *)
 158module Tokenizer = Tokenizer
 159
 160(** Encoding detection and decoding.
 161
 162    HTML documents can use various character encodings (UTF-8, ISO-8859-1,
 163    etc.). This module implements the WHATWG encoding sniffing algorithm
 164    that browsers use to detect the encoding of a document:
 165
 166    1. Check for a BOM (Byte Order Mark)
 167    2. Look for a [<meta charset>] declaration
 168    3. Use HTTP Content-Type header hint (if available)
 169    4. Fall back to UTF-8
 170
 171    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
 172         WHATWG: Determining the character encoding
 173    @see <https://encoding.spec.whatwg.org/>
 174         WHATWG Encoding Standard *)
 175module Encoding = Encoding
 176
 177(** CSS selector engine.
 178
 179    This module provides CSS selector support for querying the DOM tree.
 180    CSS selectors are patterns used to select HTML elements based on their
 181    tag names, attributes, classes, IDs, and position in the document.
 182
 183    Example selectors:
 184    - [div] - all [<div>] elements
 185    - [#header] - element with [id="header"]
 186    - [.warning] - elements with [class="warning"]
 187    - [div > p] - [<p>] elements that are direct children of [<div>]
 188    - [[href]] - elements with an [href] attribute
 189
 190    @see <https://www.w3.org/TR/selectors-4/>
 191         W3C Selectors Level 4 specification *)
 192module Selector = Selector
 193
 194(** HTML entity decoding.
 195
 196    HTML uses {i character references} to represent characters that are
 197    hard to type or have special meaning:
 198
 199    - Named references: [&amp;] (ampersand), [&lt;] (less than), [&nbsp;] (non-breaking space)
 200    - Decimal references: [&#60;] (less than as decimal 60)
 201    - Hexadecimal references: [&#x3C;] (less than as hex 3C)
 202
 203    This module decodes all 2,231 named character references defined in
 204    the WHATWG specification, plus numeric references.
 205
 206    @see <https://html.spec.whatwg.org/multipage/named-characters.html>
 207         WHATWG: Named character references *)
 208module Entities = Entities
 209
 210(** Low-level parser access.
 211
 212    This module exposes the internals of the HTML5 parser for advanced use.
 213    Most users should use the top-level {!parse} function instead.
 214
 215    The parser exposes:
 216    - Insertion modes for the tree construction algorithm
 217    - The tree builder state machine
 218    - Lower-level parsing functions
 219
 220    @see <https://html.spec.whatwg.org/multipage/parsing.html#tree-construction>
 221         WHATWG: Tree construction *)
 222module Parser = Parser
 223
 224(** {1 Core Types} *)
 225
 226(** DOM node type.
 227
 228    A node represents one part of an HTML document. Nodes form a tree
 229    structure with parent/child relationships. There are several kinds:
 230
 231    - {b Element nodes}: HTML tags like [<div>], [<p>], [<a>]
 232    - {b Text nodes}: Text content within elements
 233    - {b Comment nodes}: HTML comments [<!-- ... -->]
 234    - {b Document nodes}: The root of a document tree
 235    - {b Document fragment nodes}: Lightweight containers
 236    - {b Doctype nodes}: The [<!DOCTYPE html>] declaration
 237
 238    See {!Dom} for manipulation functions.
 239
 240    @see <https://html.spec.whatwg.org/multipage/dom.html>
 241         WHATWG: The DOM *)
 242type node = Dom.node
 243
 244val pp_node : Format.formatter -> node -> unit
 245(** [pp_node ppf node] pretty-prints a summary of [node] on [ppf], showing the
 246    node type and key attributes. Children are not printed recursively. *)
 247
 248(** DOCTYPE information.
 249
 250    The DOCTYPE declaration ([<!DOCTYPE html>]) appears at the start of HTML
 251    documents. It tells browsers to use standards mode for rendering.
 252
 253    In HTML5, the DOCTYPE is minimal - just [<!DOCTYPE html>] with no public
 254    or system identifiers. Legacy DOCTYPEs may have additional fields.
 255
 256    @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
 257         WHATWG: The DOCTYPE *)
 258type doctype_data = Dom.doctype_data = {
 259  name : string option;
 260  (** DOCTYPE name, typically ["html"] *)
 261
 262  public_id : string option;
 263  (** Public identifier for legacy DOCTYPEs (e.g., XHTML, HTML4) *)
 264
 265  system_id : string option;
 266  (** System identifier (URL) for legacy DOCTYPEs *)
 267}
 268
 269val pp_doctype_data : Format.formatter -> doctype_data -> unit
 270(** [pp_doctype_data ppf d] pretty-prints the DOCTYPE data [d] on [ppf]; usable with [%a] in [Format] functions. *)
 271
 272(** Source location for nodes.
 273
 274    Records the line and column where a node was found in the source HTML.
 275    The end position is optional for nodes like text that may span multiple
 276    locations. *)
 277type location = Dom.location = {
 278  line : int;
 279  (** 1-indexed line number where the node starts *)
 280
 281  column : int;
 282  (** 1-indexed column number where the node starts *)
 283
 284  end_line : int option;
 285  (** Optional line number where the node ends *)
 286
 287  end_column : int option;
 288  (** Optional column number where the node ends *)
 289}
 290
 291val make_location : line:int -> column:int -> ?end_line:int -> ?end_column:int -> unit -> location
 292(** [make_location ~line ~column ?end_line ?end_column ()] builds a {!location}. [line] and [column] are 1-indexed; the end position is optional. *)
 293
 294val get_location : node -> location option
 295(** [get_location node] is [Some loc] if a source location has been set on [node], and [None] otherwise. *)
 296
 297val set_location : node -> line:int -> column:int -> ?end_line:int -> ?end_column:int -> unit -> unit
 298(** [set_location node ~line ~column ?end_line ?end_column ()] sets the source location of [node]. [line] and [column] are 1-indexed, as in {!location}. *)
 299
 300(** Quirks mode as determined during parsing.
 301
 302    {i Quirks mode} controls how browsers render CSS and compute layouts.
 303    It exists for backwards compatibility with old web pages that relied
 304    on browser bugs.
 305
 306    - {b No_quirks}: Standards mode. The document is rendered according to
 307      modern HTML5 and CSS specifications. Triggered by [<!DOCTYPE html>].
 308
 309    - {b Quirks}: Full quirks mode. The browser emulates bugs from older
 310      browsers (primarily IE5). Triggered by missing or malformed DOCTYPEs.
 311      Affects CSS box model, table layout, font inheritance, and more.
 312
 313    - {b Limited_quirks}: Almost standards mode. Only a few specific quirks
 314      are applied, mainly affecting table cell vertical alignment.
 315
 316    {b Recommendation:} Always use [<!DOCTYPE html>] to ensure standards mode.
 317
 318    @see <https://quirks.spec.whatwg.org/>
 319         Quirks Mode Standard
 320    @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
 321         WHATWG: How quirks mode is determined *)
 322type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks
 323
 324val pp_quirks_mode : Format.formatter -> quirks_mode -> unit
 325(** [pp_quirks_mode ppf mode] pretty-prints the quirks mode [mode] on [ppf]; usable with [%a] in [Format] functions. *)
 326
 327(** Character encoding detected or specified.
 328
 329    HTML documents are sequences of bytes that must be decoded into characters.
 330    Different encodings interpret the same bytes differently. For example:
 331
 332    - UTF-8: The modern standard, supporting all Unicode characters
 333    - Windows-1252: Common on older Western European web pages
 334    - ISO-8859-2: Used for Central European languages
 335    - UTF-16: Used by some Windows applications
 336
 337    The parser detects encoding automatically when using {!parse_bytes}.
 338    The detected encoding is available via {!val-encoding}.
 339
 340    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
 341         WHATWG: Determining the character encoding
 342    @see <https://encoding.spec.whatwg.org/>
 343         WHATWG Encoding Standard *)
 344type encoding = Encoding.encoding =
 345  | Utf8
 346  (** UTF-8: The dominant encoding for the web, supporting all Unicode *)
 347
 348  | Utf16le
 349  (** UTF-16 Little-Endian: 16-bit encoding, used by Windows *)
 350
 351  | Utf16be
 352  (** UTF-16 Big-Endian: 16-bit encoding, network byte order *)
 353
 354  | Windows_1252
 355  (** Windows-1252 (CP-1252): Western European, superset of ISO-8859-1 *)
 356
 357  | Iso_8859_2
 358  (** ISO-8859-2: Central European (Polish, Czech, Hungarian, etc.) *)
 359
 360  | Euc_jp
 361  (** EUC-JP: Extended Unix Code for Japanese *)
 362
 363val pp_encoding : Format.formatter -> encoding -> unit
 364(** [pp_encoding ppf enc] pretty-prints the encoding [enc] on [ppf] using its canonical label. *)
 365
 366(** A parse error encountered during HTML5 parsing.
 367
 368    HTML5 parsing {b never fails} - the specification defines error recovery
 369    for all malformed input. However, conformance checkers can report these
 370    errors. Enable error collection with [~collect_errors:true] if you want
 371    to detect malformed HTML.
 372
 373    {b Common parse errors:}
 374
 375    - ["unexpected-null-character"]: Null byte in the input
 376    - ["eof-before-tag-name"]: File ended while reading a tag
 377    - ["unexpected-character-in-attribute-name"]: Invalid attribute syntax
 378    - ["missing-doctype"]: Document started without [<!DOCTYPE>]
 379    - ["duplicate-attribute"]: Same attribute appears twice on an element
 380
 381    The full list of parse error codes is defined in the WHATWG specification.
 382
 383    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 384         WHATWG: Complete list of parse errors *)
 385type parse_error = Parser.parse_error
 386
 387(** Get the error code.
 388
 389    Returns the {!Parse_error_code.t} variant representing this error.
 390    This allows pattern matching on specific error types:
 391
 392    {[
 393      match Html5rw.error_code err with
 394      | Parse_error_code.Unexpected_null_character -> (* handle *)
 395      | Parse_error_code.Eof_in_tag -> (* handle *)
 396      | Parse_error_code.Tree_construction_error msg -> (* handle tree error *)
 397      | _ -> (* other *)
 398    ]}
 399
 400    Use {!Parse_error_code.to_string} to convert to a string representation.
 401
 402    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 403         WHATWG: Parse error codes *)
 404val error_code : parse_error -> Parse_error_code.t
 405
 406(** [error_line err] is the line number where [err] occurred (1-indexed).
 407
 408    Line numbers count from 1 and increment at each newline character. *)
 409val error_line : parse_error -> int
 410
 411(** [error_column err] is the column number where [err] occurred (1-indexed).
 412
 413    Column numbers count from 1 and reset at each newline. *)
 414val error_column : parse_error -> int
 415
 416val pp_parse_error : Format.formatter -> parse_error -> unit
 417(** [pp_parse_error ppf err] pretty-prints the parse error [err] on [ppf], with location information. *)
 418
 419(** {1 Error Handling} *)
 420
 421(** Unified error type covering every error the Html5rw library can report.
 422
 423    This module provides a unified error type for all parsing and selector
 424    errors, along with printers and conversion functions. Use this when you
 425    want to handle all possible errors from the library in a uniform way.
 426
 427    {2 Usage}
 428
 429    {[
 430      (* Converting parse errors *)
 431      let errors = Html5rw.errors result in
 432      List.iter (fun err ->
 433        let unified = Html5rw.Error.of_parse_error err in
 434        Printf.eprintf "%s\n" (Html5rw.Error.to_string unified)
 435      ) errors
 436
 437      (* Catching selector errors *)
 438      match Html5rw.query result selector with
 439      | nodes -> (* success *)
 440      | exception Html5rw.Selector.Selector_error code ->
 441        let unified = Html5rw.Error.of_selector_error code in
 442        Printf.eprintf "%s\n" (Html5rw.Error.to_string unified)
 443    ]}
 444*)
 445module Error : sig
 446  (** The unified error type for the Html5rw library. *)
 447  type t =
 448    | Parse_error of {
 449        code : Parse_error_code.t;
 450        line : int;
 451        column : int;
 452      }
 453        (** An HTML parse error, including location information.
 454
 455            Parse errors occur during HTML tokenization and tree construction.
 456            The location indicates where in the input the error was detected.
 457
 458            @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 459                 WHATWG: Parse errors *)
 460
 461    | Selector_error of Selector.Error_code.t
 462        (** A CSS selector parse error.
 463
 464            Selector errors occur when parsing malformed CSS selectors passed
 465            to {!query} or {!matches}. *)
 466
 467  val of_parse_error : parse_error -> t
 468  (** Convert a parse error to the unified error type.
 469
 470      {[
 471        let errors = Html5rw.errors result in
 472        let unified_errors = List.map Html5rw.Error.of_parse_error errors
 473      ]} *)
 474
 475  val of_selector_error : Selector.Error_code.t -> t
 476  (** Convert a selector error code to the unified error type.
 477
 478      {[
 479        match Html5rw.query result "invalid[" with
 480        | _ -> ()
 481        | exception Html5rw.Selector.Selector_error code ->
 482          let err = Html5rw.Error.of_selector_error code in
 483          Printf.eprintf "%s\n" (Html5rw.Error.to_string err)
 484      ]} *)
 485
 486  val to_string : t -> string
 487  (** Convert to a human-readable error message with location information.
 488
 489      Examples:
 490      - ["Parse error at 5:12: unexpected-null-character"]
 491      - ["Selector error: Expected \]"] *)
 492
 493  val pp : Format.formatter -> t -> unit
 494  (** Pretty-printer for use with [Format] functions. *)
 495
 496  val code_string : t -> string
 497  (** Get just the error code as a kebab-case string (without location).
 498
 499      This is useful for programmatic error handling or logging.
 500
 501      Examples:
 502      - ["unexpected-null-character"]
 503      - ["expected-closing-bracket"] *)
 504end
 505
 506(** {1 Fragment Parsing} *)
 507
 508(** Context element for HTML fragment parsing (innerHTML).
 509
 510    When parsing HTML fragments (like the [innerHTML] of an element), you
 511    must specify what element would contain the fragment. This affects how
 512    the parser handles certain elements.
 513
 514    {b Why context matters:}
 515
 516    HTML parsing rules depend on where content appears. For example:
 517    - [<td>] is valid inside [<tr>] but not inside [<div>]
 518    - [<li>] is valid inside [<ul>] but creates implied lists elsewhere
 519    - Content inside [<table>] has special parsing rules
 520
 521    {b Example:}
 522    {[
 523      (* Parse as if content were inside a <ul> *)
 524      let ctx = make_fragment_context ~tag_name:"ul" () in
 525      let result = parse ~fragment_context:ctx reader
 526      (* Now <li> elements are parsed correctly *)
 527    ]}
 528
 529    @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
 530         WHATWG: The fragment parsing algorithm *)
 531type fragment_context = Parser.fragment_context
 532
 533(** Create a fragment parsing context.
 534
 535    The context element determines how the parser interprets the fragment.
 536    Choose a context that matches where the fragment would be inserted.
 537
 538    @param tag_name Tag name of the context element (e.g., ["div"], ["tr"],
 539           ["ul"]). This is the element that would contain the fragment.
 540    @param namespace Namespace of the context element:
 541           - [None] (default): HTML namespace
 542           - [Some "svg"]: SVG namespace
 543           - [Some "mathml"]: MathML namespace
 544
 545    {b Examples:}
 546    {[
 547      (* Parse as innerHTML of a <div> (most common case) *)
 548      let ctx = make_fragment_context ~tag_name:"div" ()
 549
 550      (* Parse as innerHTML of a <ul> - <li> elements work correctly *)
 551      let ctx = make_fragment_context ~tag_name:"ul" ()
 552
 553      (* Parse as innerHTML of an SVG <g> element *)
 554      let ctx = make_fragment_context ~tag_name:"g" ~namespace:(Some "svg") ()
 555
 556      (* Parse as innerHTML of a <table> - table-specific rules apply *)
 557      let ctx = make_fragment_context ~tag_name:"table" ()
 558    ]}
 559
 560    @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
 561         WHATWG: Fragment parsing algorithm *)
 562val make_fragment_context : tag_name:string -> ?namespace:string option ->
 563  unit -> fragment_context
 564
 565(** [fragment_context_tag ctx] is the tag name of the context element described by [ctx]. *)
 566val fragment_context_tag : fragment_context -> string
 567
 568(** [fragment_context_namespace ctx] is the namespace of the context element of [ctx]: [None] for HTML, or e.g. [Some "svg"] or [Some "mathml"] (see {!make_fragment_context}). *)
 569val fragment_context_namespace : fragment_context -> string option
 570
 571val pp_fragment_context : Format.formatter -> fragment_context -> unit
 572(** [pp_fragment_context ppf ctx] pretty-prints the fragment context [ctx] on [ppf]; usable with [%a] in [Format] functions. *)
 573
 574(** Result of parsing an HTML document.
 575
 576    This record contains everything produced by parsing:
 577    - The DOM tree (accessible via {!val-root})
 578    - Any parse errors (accessible via {!val-errors})
 579    - The detected encoding (accessible via {!val-encoding})
 580*)
 581type t = {
 582  root : node;
 583  (** Root node of the parsed document tree.
 584
 585      For full document parsing, this is a Document node containing the
 586      DOCTYPE (if any) and [<html>] element.
 587
 588      For fragment parsing, this is a Document Fragment containing the
 589      parsed elements. *)
 590
 591  errors : parse_error list;
 592  (** Parse errors encountered during parsing.
 593
 594      This list is empty unless [~collect_errors:true] was passed to the
 595      parse function. Errors are in the order they were encountered.
 596
 597      @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 598           WHATWG: Parse errors *)
 599
 600  encoding : encoding option;
 601  (** Character encoding detected during parsing.
 602
 603      This is [Some encoding] when using {!parse_bytes} with automatic
 604      encoding detection, and [None] when using {!parse} (which expects
 605      pre-decoded UTF-8 input). *)
 606}
 607
 608val pp : Format.formatter -> t -> unit
 609(** [pp ppf result] pretty-prints a summary of the parse result [result] on [ppf]. *)
 610
 611(** {1 Parsing Functions} *)
 612
 613(** Parse HTML from a [Bytes.Reader.t].
 614
 615    This is the primary parsing function. It reads bytes from the provided
 616    reader and returns a DOM tree. The input should be valid UTF-8.
 617
 618    {b Creating readers:}
 619    {[
 620      open Bytesrw
 621
 622      (* From a string *)
 623      let reader = Bytes.Reader.of_string html_string
 624
 625      (* From a file *)
 626      let ic = open_in "page.html" in
 627      let reader = Bytes.Reader.of_in_channel ic
 628
 629      (* From a buffer *)
 630      let reader = Bytes.Reader.of_string (Buffer.contents buf)
 631    ]}
 632
 633    {b Parsing a complete document:}
 634    {[
 635      let result = Html5rw.parse reader
 636      let doc = Html5rw.root result
 637    ]}
 638
 639    {b Parsing a fragment:}
 640    {[
 641      let ctx = Html5rw.make_fragment_context ~tag_name:"div" () in
 642      let result = Html5rw.parse ~fragment_context:ctx reader
 643    ]}
 644
 645    @param collect_errors If [true], collect parse errors. Default: [false].
 646           Error collection has some performance overhead.
 647    @param fragment_context Context element for fragment parsing. If provided,
 648           the input is parsed as a fragment (like innerHTML) rather than
 649           a complete document.
 650
 651    @see <https://html.spec.whatwg.org/multipage/parsing.html>
 652         WHATWG: HTML parsing algorithm *)
 653val parse : ?collect_errors:bool -> ?fragment_context:fragment_context ->
 654  Bytesrw.Bytes.Reader.t -> t
 655
 656(** Parse raw bytes with automatic encoding detection.
 657
 658    This function is useful when you have raw bytes and don't know the
 659    character encoding. It implements the WHATWG encoding sniffing algorithm:
 660
 661    1. {b BOM detection}: Check for UTF-8, UTF-16LE, or UTF-16BE BOM
 662    2. {b Prescan}: Look for [<meta charset="...">] in the first 1024 bytes
 663    3. {b Transport hint}: Use the provided [transport_encoding] if any
 664    4. {b Fallback}: Use UTF-8 (the modern web default)
 665
 666    The detected encoding is stored in the result's [encoding] field.
 667
 668    {b Example:}
 669    {[
 670      let bytes = Bytes.of_string (really_input_string ic (in_channel_length ic)) in
 671      let result = Html5rw.parse_bytes bytes in
 672      match Html5rw.encoding result with
 673      | Some Utf8 -> print_endline "UTF-8 detected"
 674      | Some Windows_1252 -> print_endline "Windows-1252 detected"
 675      | _ -> ()
 676    ]}
 677
 678    @param collect_errors If [true], collect parse errors. Default: [false].
 679    @param transport_encoding Encoding hint from HTTP Content-Type header.
 680           For example, if the server sends [Content-Type: text/html; charset=utf-8],
 681           pass [~transport_encoding:"utf-8"].
 682    @param fragment_context Context element for fragment parsing.
 683
 684    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
 685         WHATWG: Determining the character encoding *)
 686val parse_bytes : ?collect_errors:bool -> ?transport_encoding:string ->
 687  ?fragment_context:fragment_context -> bytes -> t
 688
 689(** {1 Querying} *)
 690
 691(** Query the DOM tree with a CSS selector.
 692
 693    CSS selectors are patterns used to select elements in HTML documents.
 694    This function returns all nodes matching the selector, in document order.
 695
 696    {b Supported selectors:}
 697
 698    {i Type selectors:}
 699    - [div], [p], [span] - elements by tag name
 700
 701    {i Class and ID selectors:}
 702    - [#myid] - element with [id="myid"]
 703    - [.myclass] - elements with class containing "myclass"
 704
 705    {i Attribute selectors:}
 706    - [[attr]] - elements with the [attr] attribute
 707    - [[attr="value"]] - attribute equals value
 708    - [[attr~="value"]] - attribute contains word
 709    - [[attr|="value"]] - attribute starts with value or value-
 710    - [[attr^="value"]] - attribute starts with value
 711    - [[attr$="value"]] - attribute ends with value
 712    - [[attr*="value"]] - attribute contains value
 713
 714    {i Pseudo-classes:}
 715    - [:first-child], [:last-child] - first/last child of parent
 716    - [:nth-child(n)] - nth child (1-indexed)
 717    - [:only-child] - only child of parent
 718    - [:empty] - elements with no children
 719    - [:not(selector)] - elements not matching selector
 720
 721    {i Combinators:}
 722    - [A B] - B descendants of A (any depth)
 723    - [A > B] - B direct children of A
 724    - [A + B] - B immediately after A (adjacent sibling)
 725    - [A ~ B] - B after A (general sibling)
 726
 727    {i Universal:}
 728    - [*] - all elements
 729
 730    {b Examples:}
 731    {[
 732      (* All paragraphs *)
 733      let ps = query result "p"
 734
 735      (* Elements with class "warning" inside a div *)
 736      let warnings = query result "div .warning"
 737
 738      (* Direct children of nav that are links *)
 739      let nav_links = query result "nav > a"
 740
 741      (* Complex selector *)
 742      let items = query result "ul.menu > li:first-child a[href]"
 743    ]}
 744
 745    @raise Selector.Selector_error if the selector syntax is invalid
 746
 747    @see <https://www.w3.org/TR/selectors-4/>
 748         W3C: Selectors Level 4 *)
 749val query : t -> string -> node list
 750
 751(** Check if a node matches a CSS selector.
 752
 753    This is useful for filtering nodes or implementing custom traversals.
 754
 755    {b Example:}
 756    {[
 757      let is_external_link node =
 758        matches node "a[href^='http']"
 759    ]}
 760
 761    @raise Selector.Selector_error if the selector syntax is invalid *)
 762val matches : node -> string -> bool
 763
 764(** {1 Serialization} *)
 765
 766(** Write the DOM tree to a [Bytes.Writer.t].
 767
 768    This serializes the DOM back to HTML. The output is valid HTML5 that
 769    can be parsed to produce an equivalent DOM tree.
 770
 771    {b Example:}
 772    {[
 773      open Bytesrw
 774      let buf = Buffer.create 1024 in
 775      let writer = Bytes.Writer.of_buffer buf in
 776      Html5rw.to_writer result writer;
 777      Bytes.Writer.write_eod writer;
 778      let html = Buffer.contents buf
 779    ]}
 780
 781    @param pretty If [true] (default), add indentation for readability.
 782           If [false], output compact HTML with no added whitespace.
 783    @param indent_size Spaces per indentation level (default: 2).
 784           Only used when [pretty] is [true].
 785
 786    @see <https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments>
 787         WHATWG: Serialising HTML fragments *)
 788val to_writer : ?pretty:bool -> ?indent_size:int -> t ->
 789  Bytesrw.Bytes.Writer.t -> unit
 790
 791(** Serialize the DOM tree to a string.
 792
 793    Convenience function that serializes to a string instead of a writer.
 794    Use {!to_writer} for large documents to avoid building the whole output as one string.
 795
 796    @param pretty If [true] (default), add indentation for readability.
 797    @param indent_size Spaces per indentation level (default: 2). Only used when [pretty] is [true]. *)
 798val to_string : ?pretty:bool -> ?indent_size:int -> t -> string
 799
 800(** Extract text content from the DOM tree.
 801
 802    This joins all text nodes in the document with [separator], producing a
 803    string with just the readable text (no HTML tags). For a single node, see {!get_text_content}.
 804
 805    {b Example:}
 806    {[
 807      (* For document: <div><p>Hello</p><p>World</p></div> *)
 808      let text = to_text result
 809      (* Returns: "Hello World" (the two text nodes joined by the default separator) *)
 810    ]}
 811
 812    @param separator String to insert between text nodes (default: [" "])
 813    @param strip If [true] (default), trim leading/trailing whitespace *)
 814val to_text : ?separator:string -> ?strip:bool -> t -> string
 815
 816(** Serialize to html5lib test format.
 817
 818    This produces the tree format used by the
 819    {{:https://github.com/html5lib/html5lib-tests} html5lib-tests} suite.
 820    Mainly useful for testing the parser against the reference tests; for HTML output use {!to_string}. *)
 821val to_test_format : t -> string
 822
 823(** {1 Result Accessors} *)
 824
 825(** Get the root node of the parsed document.
 826
 827    For full document parsing, this returns a Document node (see {!is_document}). The structure is:
 828    {v
 829    #document
 830    ├── !doctype (if present)
 831    └── html
 832        ├── head
 833        └── body
 834    v}
 835
 836    For fragment parsing, this returns a Document Fragment node (see
 837    {!is_document_fragment}) containing the parsed elements directly. *)
 838val root : t -> node
 839
 840(** Get parse errors (if error collection was enabled).
 841
 842    Returns an empty list if [~collect_errors:true] was not passed to the
 843    parse function, or if the document was well-formed. An empty list
        therefore only indicates well-formed input when collection was enabled.
 844
 845    Errors are returned in the order they were encountered during parsing.
 846
 847    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 848         WHATWG: Parse errors *)
 849val errors : t -> parse_error list
 850
 851(** Get the detected encoding (if parsed from bytes).
 852
 853    Returns [Some encoding] when {!parse_bytes} was used, carrying the
 854    encoding that was detected or specified. Returns [None] when {!parse} was
 855    used, since it expects pre-decoded UTF-8 input.
 856
 857    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
 858         WHATWG: Determining the character encoding *)
 859val encoding : t -> encoding option
 860
 861(** {1 DOM Utilities}
 862
 863    Common DOM operations are available directly on this module. For the
 864    full API including more advanced operations, see the {!Dom} module.
 865
 866    @see <https://html.spec.whatwg.org/multipage/dom.html>
 867         WHATWG: Semantics, structure, and APIs of HTML documents
 868*)
 869
 870(** Create an element node.
 871
 872    Elements are the building blocks of HTML documents. They represent tags
 873    like [<div>], [<p>], [<a>], etc. The trailing [()] argument is always required, as in the examples.
 874
 875    @param name Tag name (e.g., ["div"], ["p"], ["span"])
 876    @param namespace Element namespace:
 877           - [None] (default): HTML namespace
 878           - [Some "svg"]: SVG namespace for graphics
 879           - [Some "mathml"]: MathML namespace for math notation
 880    @param attrs Initial attributes as [(name, value)] pairs
        @param location Optional {!Dom.location} to attach to the node
               (presumably its position in the source text - TODO confirm)
 881
 882    {b Example:}
 883    {[
 884      (* Simple element *)
 885      let div = create_element "div" ()
 886
 887      (* Element with attributes *)
 888      let link = create_element "a"
 889        ~attrs:[("href", "/about"); ("class", "nav-link")]
 890        ()
 891    ]}
 892
 893    @see <https://html.spec.whatwg.org/multipage/dom.html#elements-in-the-dom>
 894         WHATWG: Elements in the DOM *)
 895val create_element : string -> ?namespace:string option ->
 896  ?attrs:(string * string) list -> ?location:Dom.location -> unit -> node
 897
 898(** Create a text node.
 899
 900    Text nodes contain the readable text content of HTML documents.
 901
 902    {b Example:}
 903    {[
 904      let text = create_text "Hello, world!"
 905    ]}
        @param location Optional {!Dom.location} to attach to the node
               (presumably its position in the source text - TODO confirm) *)
 906val create_text : ?location:Dom.location -> string -> node
 907
 908(** Create a comment node.
 909
 910    Comments are preserved in the DOM but not rendered. They're written
 911    as [<!-- text -->] in HTML.
 912
        @param location Optional {!Dom.location} to attach to the node
               (presumably its position in the source text - TODO confirm)
 913    @see <https://html.spec.whatwg.org/multipage/syntax.html#comments>
 914         WHATWG: Comments *)
 915val create_comment : ?location:Dom.location -> string -> node
 916
 917(** Create an empty document node.
 918
 919    The Document node is the root of an HTML document tree. Children can be added with {!append_child}.
 920
 921    @see <https://html.spec.whatwg.org/multipage/dom.html#document>
 922         WHATWG: The Document object *)
 923val create_document : unit -> node
 924
 925(** Create a document fragment node.
 926
 927    Document fragments are lightweight containers for holding nodes
 928    without a parent document. Used for template contents and fragment
 929    parsing results (see {!root}). Test for them with {!is_document_fragment}.
 930
 931    @see <https://dom.spec.whatwg.org/#documentfragment>
 932         DOM Standard: DocumentFragment *)
 933val create_document_fragment : unit -> node
 934
 935(** Create a doctype node.
 936
 937    For HTML5 documents, use [create_doctype ~name:"html" ()].
 938
 939    @param name DOCTYPE name (usually ["html"])
 940    @param public_id Public identifier (legacy)
 941    @param system_id System identifier (legacy)
        @param location Optional [location] value to attach to the node
               (presumably its position in the source text - TODO confirm)
 942
 943    @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
 944         WHATWG: The DOCTYPE *)
 945val create_doctype : ?name:string -> ?public_id:string ->
 946  ?system_id:string -> ?location:location -> unit -> node
 947
 948(** Append a child node to a parent.
 949
 950    The child is added as the last child of the parent. If the child
 951    already has a parent, it is first removed from that parent.
        Presumably called as [append_child parent child], in the same order
        as the parameters of {!insert_before} (TODO confirm). *)
 952val append_child : node -> node -> unit
 953
 954(** Insert a node before a reference node.
 955
 956    @param parent The parent node
 957    @param new_child The node to insert
 958    @param ref_child The existing child to insert before
 959
 960    @raise Not_found if [ref_child] is not a child of [parent]. *)
 961val insert_before : node -> node -> node -> unit
 962
 963(** Remove a child node from its parent.
 964
        Presumably called as [remove_child parent child] (TODO confirm the
        argument order).

 965    @raise Not_found if [child] is not a child of [parent]. *)
 966val remove_child : node -> node -> unit
 967
 968(** Get an attribute value.
 969
 970    [get_attr node name] returns [Some value] if the attribute exists, [None] otherwise.
 971    Attribute names are case-sensitive (but were lowercased during parsing).
 972
 973    @see <https://html.spec.whatwg.org/multipage/dom.html#attributes>
 974         WHATWG: Attributes *)
 975val get_attr : node -> string -> string option
 976
 977(** Set an attribute value.
 978
 979    If the attribute exists, it is replaced. If not, it is added.
        The two strings are presumably the attribute name followed by its
        value (TODO confirm). *)
 980val set_attr : node -> string -> string -> unit
 981
 982(** Check if a node has an attribute with the given name. See also {!get_attr}. *)
 983val has_attr : node -> string -> bool
 984
 985(** Get all descendant nodes in document order.
 986
 987    Returns all nodes below this node in the tree, in the order they
 988    appear in the HTML source (depth-first). For the upward direction, see {!ancestors}. *)
 989val descendants : node -> node list
 990
 991(** Get all ancestor nodes from parent to root.
 992
 993    Returns the chain of parent nodes, starting with the immediate parent
 994    and ending with the Document node. For the downward direction, see {!descendants}. *)
 995val ancestors : node -> node list
 996
 997(** Get text content of a node and its descendants.
 998
 999    For text nodes, returns the text directly. For elements, recursively
1000    concatenates all descendant text content. For a whole parse result, see {!to_text}. *)
1001val get_text_content : node -> string
1002
1003(** Clone a node, returning the copy.
1004
1005    @param deep If [true], recursively clone all descendants.
1006           If [false] (default), only clone the node itself. *)
1007val clone : ?deep:bool -> node -> node
1008
1009(** {1 Node Predicates}
1010
1011    Functions to test what type of node you have.
1012*)
1013
1014(** Test if a node is an element.
1015
1016    Elements are HTML tags like [<div>], [<p>], [<a>]. See also {!create_element}. *)
1017val is_element : node -> bool
1018
1019(** Test if a node is a text node.
1020
1021    Text nodes contain character content within elements. See also {!create_text}. *)
1022val is_text : node -> bool
1023
1024(** Test if a node is a comment node.
1025
1026    Comment nodes represent HTML comments [<!-- ... -->]. See also {!create_comment}. *)
1027val is_comment : node -> bool
1028
1029(** Test if a node is a document node.
1030
1031    The document node is the root of a complete HTML document tree. See also {!create_document}. *)
1032val is_document : node -> bool
1033
1034(** Test if a node is a document fragment.
1035
1036    Document fragments are lightweight containers for nodes. See also {!create_document_fragment}. *)
1037val is_document_fragment : node -> bool
1038
1039(** Test if a node is a doctype node.
1040
1041    Doctype nodes represent the [<!DOCTYPE>] declaration. See also {!create_doctype}. *)
1042val is_doctype : node -> bool
1043
1044(** Test if a node has children. Children are added with {!append_child}. *)
1045val has_children : node -> bool