lib/html5rw/html5rw.mli at 81c4816404ceafd6d88e08303e3870f364dc0a32 · anil.recoil.org/ocaml-html5rw

OCaml HTML5 parser/serialiser based on Python's JustHTML
ocaml-html5rw / lib / html5rw / html5rw.mli
at 81c4816404ceafd6d88e08303e3870f364dc0a32 35 kB view raw
   1(*---------------------------------------------------------------------------
   2  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   3  SPDX-License-Identifier: MIT
   4 ---------------------------------------------------------------------------*)
   5
   6(** Html5rw - Pure OCaml HTML5 Parser
   7
   8    This library provides a complete HTML5 parsing solution that implements the
   9    {{:https://html.spec.whatwg.org/multipage/parsing.html} WHATWG HTML5
  10    parsing specification}. It can parse any HTML document - well-formed or not -
  11    and produce a DOM (Document Object Model) tree that matches browser behavior.
  12
  13    {2 What is HTML?}
  14
  15    HTML (HyperText Markup Language) is the standard markup language for creating
  16    web pages. An HTML document consists of nested {i elements} that describe
  17    the structure and content of the page:
  18
  19    {v
  20    <!DOCTYPE html>
  21    <html>
  22      <head>
  23        <title>My Page</title>
  24      </head>
  25      <body>
  26        <h1>Welcome</h1>
  27        <p>Hello, <b>world</b>!</p>
  28      </body>
  29    </html>
  30    v}
  31
  32    Each element is written with a {i start tag} (like [<p>]), content, and an
  33    {i end tag} (like [</p>]). Elements can have {i attributes} that provide
  34    additional information: [<a href="https://example.com">].
  35
  36    @see <https://html.spec.whatwg.org/multipage/introduction.html>
  37         WHATWG: Introduction to HTML
  38
  39    {2 The DOM}
  40
  41    When this parser processes HTML, it doesn't just store the text. Instead,
  42    it builds a tree structure called the DOM (Document Object Model). Each
  43    element, text fragment, and comment becomes a {i node} in this tree:
  44
  45    {v
  46    Document
  47    └── html
  48        ├── head
  49        │   └── title
  50        │       └── #text "My Page"
  51        └── body
  52            ├── h1
  53            │   └── #text "Welcome"
  54            └── p
  55                ├── #text "Hello, "
  56                ├── b
  57                │   └── #text "world"
  58                └── #text "!"
  59    v}
  60
  61    This tree can be traversed, searched, and modified. The {!Dom} module
  62    provides types and functions for working with DOM nodes.
  63
  64    @see <https://html.spec.whatwg.org/multipage/dom.html>
  65         WHATWG: The elements of HTML (DOM chapter)
  66
  67    {2 Quick Start}
  68
  69    Parse HTML from a string:
  70    {[
  71      open Bytesrw
  72      let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
  73      let result = Html5rw.parse reader in
  74      let html = Html5rw.to_string result
  75    ]}
  76
  77    Parse from a file:
  78    {[
  79      open Bytesrw
  80      let ic = open_in "page.html" in
  81      let reader = Bytes.Reader.of_in_channel ic in
  82      let result = Html5rw.parse reader in
  83      close_in ic
  84    ]}
  85
  86    Query with CSS selectors:
  87    {[
  88      let result = Html5rw.parse reader in
  89      let divs = Html5rw.query result "div.content"
  90    ]}
  91
  92    {2 Error Handling}
  93
  94    Unlike many parsers, HTML5 parsing {b never fails}. The WHATWG specification
  95    defines error recovery rules for every possible malformed input, ensuring
  96    all HTML documents produce a valid DOM tree (just as browsers do).
  97
  98    For example, parsing [<p>Hello<p>World] produces two paragraphs, not an
  99    error, because [<p>] implicitly closes the previous [<p>].
 100
 101    If you need to detect malformed HTML (e.g., for validation), enable error
 102    collection with [~collect_errors:true]. Errors are advisory - the parsing
 103    still succeeds.
 104
 105    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 106         WHATWG: Parse errors
 107
 108    {2 HTML vs XHTML}
 109
 110    This parser implements {b HTML5 parsing}, not XHTML parsing. Key differences:
 111
 112    - Tag and attribute names are case-insensitive ([<DIV>] equals [<div>])
 113    - Some end tags are optional ([<p>Hello] is valid)
 114    - Void elements have no end tag (write [<br>], not [<br></br>]; the slash in [<br/>] is ignored)
 115    - Boolean attributes need no value ([<input disabled>])
 116
 117    XHTML uses stricter XML rules. If you need XHTML parsing, use an XML parser.
 118
 119    @see <https://html.spec.whatwg.org/multipage/syntax.html>
 120         WHATWG: The HTML syntax
 121*)
 122
 123(** {1 Sub-modules} *)
 124
 125(** Parse error code types.
 126
 127    This module provides the {!Parse_error_code.t} variant type that represents
 128    all WHATWG-defined parse errors plus tree construction errors.
 129
 130    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 131         WHATWG: Parse errors *)
 132module Parse_error_code = Parse_error_code
 133
 134(** DOM types and manipulation functions.
 135
 136    This module provides the core types for representing HTML documents as
 137    DOM trees. It includes:
 138    - The {!Dom.node} type representing all kinds of DOM nodes
 139    - Functions to create, modify, and traverse nodes
 140    - Serialization functions to convert DOM back to HTML
 141
 142    @see <https://html.spec.whatwg.org/multipage/dom.html>
 143         WHATWG: The elements of HTML *)
 144module Dom = Dom
 145
 146(** HTML5 tokenizer.
 147
 148    The tokenizer is the first stage of HTML5 parsing. It converts a stream
 149    of characters into a stream of {i tokens}: start tags, end tags, text,
 150    comments, and DOCTYPEs.
 151
 152    Most users don't need to use the tokenizer directly - the {!parse}
 153    function handles everything. The tokenizer is exposed for advanced use
 154    cases like syntax highlighting or partial parsing.
 155
 156    @see <https://html.spec.whatwg.org/multipage/parsing.html#tokenization>
 157         WHATWG: Tokenization *)
 158module Tokenizer = Tokenizer
 159
 160(** Encoding detection and decoding.
 161
 162    HTML documents can use various character encodings (UTF-8, ISO-8859-1,
 163    etc.). This module implements the WHATWG encoding sniffing algorithm
 164    that browsers use to detect the encoding of a document:
 165
 166    1. Check for a BOM (Byte Order Mark)
 167    2. Look for a [<meta charset>] declaration
 168    3. Use HTTP Content-Type header hint (if available)
 169    4. Fall back to UTF-8
 170
 171    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
 172         WHATWG: Determining the character encoding
 173    @see <https://encoding.spec.whatwg.org/>
 174         WHATWG Encoding Standard *)
 175module Encoding = Encoding
 176
 177(** CSS selector engine.
 178
 179    This module provides CSS selector support for querying the DOM tree.
 180    CSS selectors are patterns used to select HTML elements based on their
 181    tag names, attributes, classes, IDs, and position in the document.
 182
 183    Example selectors:
 184    - [div] - all [<div>] elements
 185    - [#header] - element with [id="header"]
 186    - [.warning] - elements with [class="warning"]
 187    - [div > p] - [<p>] elements that are direct children of [<div>]
 188    - [[href]] - elements with an [href] attribute
 189
 190    @see <https://www.w3.org/TR/selectors-4/>
 191         W3C Selectors Level 4 specification *)
 192module Selector = Selector
 193
 194(** HTML entity decoding.
 195
 196    HTML uses {i character references} to represent characters that are
 197    hard to type or have special meaning:
 198
 199    - Named references: [&amp;] (ampersand), [&lt;] (less than), [&nbsp;] (non-breaking space)
 200    - Decimal references: [&#60;] (less than as decimal 60)
 201    - Hexadecimal references: [&#x3C;] (less than as hex 3C)
 202
 203    This module decodes all 2,231 named character references defined in
 204    the WHATWG specification, plus numeric references.
 205
 206    @see <https://html.spec.whatwg.org/multipage/named-characters.html>
 207         WHATWG: Named character references *)
 208module Entities = Entities
 209
 210(** Low-level parser access.
 211
 212    This module exposes the internals of the HTML5 parser for advanced use.
 213    Most users should use the top-level {!parse} function instead.
 214
 215    The parser exposes:
 216    - Insertion modes for the tree construction algorithm
 217    - The tree builder state machine
 218    - Lower-level parsing functions
 219
 220    @see <https://html.spec.whatwg.org/multipage/parsing.html#tree-construction>
 221         WHATWG: Tree construction *)
 222module Parser = Parser
 223
 224(** {1 Core Types} *)
 225
 226(** DOM node type.
 227
 228    A node represents one part of an HTML document. Nodes form a tree
 229    structure with parent/child relationships. There are several kinds:
 230
 231    - {b Element nodes}: HTML tags like [<div>], [<p>], [<a>]
 232    - {b Text nodes}: Text content within elements
 233    - {b Comment nodes}: HTML comments [<!-- ... -->]
 234    - {b Document nodes}: The root of a document tree
 235    - {b Document fragment nodes}: Lightweight containers
 236    - {b Doctype nodes}: The [<!DOCTYPE html>] declaration
 237
 238    See {!Dom} for manipulation functions.
 239
 240    @see <https://html.spec.whatwg.org/multipage/dom.html>
 241         WHATWG: The DOM *)
 242type node = Dom.node
 243
 244val pp_node : Format.formatter -> node -> unit
 245(** Pretty-print a DOM node. Prints a summary representation showing the
 246    node type and key attributes. Does not recursively print children. *)
 247
 248(** DOCTYPE information.
 249
 250    The DOCTYPE declaration ([<!DOCTYPE html>]) appears at the start of HTML
 251    documents. It tells browsers to use standards mode for rendering.
 252
 253    In HTML5, the DOCTYPE is minimal - just [<!DOCTYPE html>] with no public
 254    or system identifiers. Legacy DOCTYPEs may have additional fields.
 255
 256    @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
 257         WHATWG: The DOCTYPE *)
 258type doctype_data = Dom.doctype_data = {
 259  name : string option;
 260  (** DOCTYPE name, typically ["html"] *)
 261
 262  public_id : string option;
 263  (** Public identifier for legacy DOCTYPEs (e.g., XHTML, HTML4) *)
 264
 265  system_id : string option;
 266  (** System identifier (URL) for legacy DOCTYPEs *)
 267}
 268
 269val pp_doctype_data : Format.formatter -> doctype_data -> unit
 270(** Pretty-print DOCTYPE data. *)
 271
 272(** Quirks mode as determined during parsing.
 273
 274    {i Quirks mode} controls how browsers render CSS and compute layouts.
 275    It exists for backwards compatibility with old web pages that relied
 276    on browser bugs.
 277
 278    - {b No_quirks}: Standards mode. The document is rendered according to
 279      modern HTML5 and CSS specifications. Triggered by [<!DOCTYPE html>].
 280
 281    - {b Quirks}: Full quirks mode. The browser emulates bugs from older
 282      browsers (primarily IE5). Triggered by missing or malformed DOCTYPEs.
 283      Affects CSS box model, table layout, font inheritance, and more.
 284
 285    - {b Limited_quirks}: Almost standards mode. Only a few specific quirks
 286      are applied, mainly affecting table cell vertical alignment.
 287
 288    {b Recommendation:} Always use [<!DOCTYPE html>] to ensure standards mode.
 289
 290    @see <https://quirks.spec.whatwg.org/>
 291         Quirks Mode Standard
 292    @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
 293         WHATWG: How quirks mode is determined *)
 294type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks
 295
 296val pp_quirks_mode : Format.formatter -> quirks_mode -> unit
 297(** Pretty-print quirks mode. *)
 298
 299(** Character encoding detected or specified.
 300
 301    HTML documents are sequences of bytes that must be decoded into characters.
 302    Different encodings interpret the same bytes differently. For example:
 303
 304    - UTF-8: The modern standard, supporting all Unicode characters
 305    - Windows-1252: Common on older Western European web pages
 306    - ISO-8859-2: Used for Central European languages
 307    - UTF-16: Used by some Windows applications
 308
 309    The parser detects encoding automatically when using {!parse_bytes}.
 310    The detected encoding is available via {!val-encoding}.
 311
 312    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
 313         WHATWG: Determining the character encoding
 314    @see <https://encoding.spec.whatwg.org/>
 315         WHATWG Encoding Standard *)
 316type encoding = Encoding.encoding =
 317  | Utf8
 318  (** UTF-8: The dominant encoding for the web, supporting all Unicode *)
 319
 320  | Utf16le
 321  (** UTF-16 Little-Endian: 16-bit encoding, used by Windows *)
 322
 323  | Utf16be
 324  (** UTF-16 Big-Endian: 16-bit encoding, network byte order *)
 325
 326  | Windows_1252
 327  (** Windows-1252 (CP-1252): Western European, superset of ISO-8859-1 *)
 328
 329  | Iso_8859_2
 330  (** ISO-8859-2: Central European (Polish, Czech, Hungarian, etc.) *)
 331
 332  | Euc_jp
 333  (** EUC-JP: Extended Unix Code for Japanese *)
 334
 335val pp_encoding : Format.formatter -> encoding -> unit
 336(** Pretty-print an encoding using its canonical label. *)
 337
 338(** A parse error encountered during HTML5 parsing.
 339
 340    HTML5 parsing {b never fails} - the specification defines error recovery
 341    for all malformed input. However, conformance checkers can report these
 342    errors. Enable error collection with [~collect_errors:true] if you want
 343    to detect malformed HTML.
 344
 345    {b Common parse errors:}
 346
 347    - ["unexpected-null-character"]: Null byte in the input
 348    - ["eof-before-tag-name"]: File ended while reading a tag
 349    - ["unexpected-character-in-attribute-name"]: Invalid attribute syntax
 350    - ["missing-doctype"]: Document started without [<!DOCTYPE>]
 351    - ["duplicate-attribute"]: Same attribute appears twice on an element
 352
 353    The full list of parse error codes is defined in the WHATWG specification.
 354
 355    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 356         WHATWG: Complete list of parse errors *)
 357type parse_error = Parser.parse_error
 358
 359(** Get the error code.
 360
 361    Returns the {!Parse_error_code.t} variant representing this error.
 362    This allows pattern matching on specific error types:
 363
 364    {[
 365      match Html5rw.error_code err with
 366      | Parse_error_code.Unexpected_null_character -> (* handle *)
 367      | Parse_error_code.Eof_in_tag -> (* handle *)
 368      | Parse_error_code.Tree_construction_error msg -> (* handle tree error *)
 369      | _ -> (* other *)
 370    ]}
 371
 372    Use {!Parse_error_code.to_string} to convert to a string representation.
 373
 374    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 375         WHATWG: Parse error codes *)
 376val error_code : parse_error -> Parse_error_code.t
 377
 378(** Get the line number where the error occurred (1-indexed).
 379
 380    Line numbers count from 1 and increment at each newline character. *)
 381val error_line : parse_error -> int
 382
 383(** Get the column number where the error occurred (1-indexed).
 384
 385    Column numbers count from 1 and reset at each newline. *)
 386val error_column : parse_error -> int
 387
 388val pp_parse_error : Format.formatter -> parse_error -> unit
 389(** Pretty-print a parse error with location information. *)
 390
 391(** {1 Error Handling} *)
 392
 393(** Global error type that wraps all errors raised by the Html5rw library.
 394
 395    This module provides a unified error type for all parsing and selector
 396    errors, along with printers and conversion functions. Use this when you
 397    want to handle all possible errors from the library in a uniform way.
 398
 399    {2 Usage}
 400
 401    {[
 402      (* Converting parse errors *)
 403      let errors = Html5rw.errors result in
 404      List.iter (fun err ->
 405        let unified = Html5rw.Error.of_parse_error err in
 406        Printf.eprintf "%s\n" (Html5rw.Error.to_string unified)
 407      ) errors
 408
 409      (* Catching selector errors *)
 410      match Html5rw.query result selector with
 411      | nodes -> (* success *)
 412      | exception Html5rw.Selector.Selector_error code ->
 413        let unified = Html5rw.Error.of_selector_error code in
 414        Printf.eprintf "%s\n" (Html5rw.Error.to_string unified)
 415    ]}
 416*)
 417module Error : sig
 418  (** The unified error type for the Html5rw library. *)
 419  type t =
 420    | Parse_error of {
 421        code : Parse_error_code.t;
 422        line : int;
 423        column : int;
 424      }
 425        (** An HTML parse error, including location information.
 426
 427            Parse errors occur during HTML tokenization and tree construction.
 428            The location indicates where in the input the error was detected.
 429
 430            @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 431                 WHATWG: Parse errors *)
 432
 433    | Selector_error of Selector.Error_code.t
 434        (** A CSS selector parse error.
 435
 436            Selector errors occur when parsing malformed CSS selectors passed
 437            to {!query} or {!matches}. *)
 438
 439  val of_parse_error : parse_error -> t
 440  (** Convert a parse error to the unified error type.
 441
 442      {[
 443        let errors = Html5rw.errors result in
 444        let unified_errors = List.map Html5rw.Error.of_parse_error errors
 445      ]} *)
 446
 447  val of_selector_error : Selector.Error_code.t -> t
 448  (** Convert a selector error code to the unified error type.
 449
 450      {[
 451        match Html5rw.query result "invalid[" with
 452        | _ -> ()
 453        | exception Html5rw.Selector.Selector_error code ->
 454          let err = Html5rw.Error.of_selector_error code in
 455          Printf.eprintf "%s\n" (Html5rw.Error.to_string err)
 456      ]} *)
 457
 458  val to_string : t -> string
 459  (** Convert to a human-readable error message with location information.
 460
 461      Examples:
 462      - ["Parse error at 5:12: unexpected-null-character"]
 463      - ["Selector error: Expected \]"] *)
 464
 465  val pp : Format.formatter -> t -> unit
 466  (** Pretty-printer for use with [Format] functions. *)
 467
 468  val code_string : t -> string
 469  (** Get just the error code as a kebab-case string (without location).
 470
 471      This is useful for programmatic error handling or logging.
 472
 473      Examples:
 474      - ["unexpected-null-character"]
 475      - ["expected-closing-bracket"] *)
 476end
 477
 478(** {1 Fragment Parsing} *)
 479
 480(** Context element for HTML fragment parsing (innerHTML).
 481
 482    When parsing HTML fragments (like the [innerHTML] of an element), you
 483    must specify what element would contain the fragment. This affects how
 484    the parser handles certain elements.
 485
 486    {b Why context matters:}
 487
 488    HTML parsing rules depend on where content appears. For example:
 489    - [<td>] is valid inside [<tr>] but not inside [<div>]
 490    - [<li>] is valid inside [<ul>] but creates implied lists elsewhere
 491    - Content inside [<table>] has special parsing rules
 492
 493    {b Example:}
 494    {[
 495      (* Parse as if content were inside a <ul> *)
 496      let ctx = make_fragment_context ~tag_name:"ul" () in
 497      let result = parse ~fragment_context:ctx reader
 498      (* Now <li> elements are parsed correctly *)
 499    ]}
 500
 501    @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
 502         WHATWG: The fragment parsing algorithm *)
 503type fragment_context = Parser.fragment_context
 504
 505(** Create a fragment parsing context.
 506
 507    The context element determines how the parser interprets the fragment.
 508    Choose a context that matches where the fragment would be inserted.
 509
 510    @param tag_name Tag name of the context element (e.g., ["div"], ["tr"],
 511           ["ul"]). This is the element that would contain the fragment.
 512    @param namespace Namespace of the context element:
 513           - [None] (default): HTML namespace
 514           - [Some "svg"]: SVG namespace
 515           - [Some "mathml"]: MathML namespace
 516
 517    {b Examples:}
 518    {[
 519      (* Parse as innerHTML of a <div> (most common case) *)
 520      let ctx = make_fragment_context ~tag_name:"div" ()
 521
 522      (* Parse as innerHTML of a <ul> - <li> elements work correctly *)
 523      let ctx = make_fragment_context ~tag_name:"ul" ()
 524
 525      (* Parse as innerHTML of an SVG <g> element *)
 526      let ctx = make_fragment_context ~tag_name:"g" ~namespace:(Some "svg") ()
 527
 528      (* Parse as innerHTML of a <table> - table-specific rules apply *)
 529      let ctx = make_fragment_context ~tag_name:"table" ()
 530    ]}
 531
 532    @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
 533         WHATWG: Fragment parsing algorithm *)
 534val make_fragment_context : tag_name:string -> ?namespace:string option ->
 535  unit -> fragment_context
 536
 537(** Get the tag name of a fragment context. *)
 538val fragment_context_tag : fragment_context -> string
 539
 540(** Get the namespace of a fragment context. *)
 541val fragment_context_namespace : fragment_context -> string option
 542
 543val pp_fragment_context : Format.formatter -> fragment_context -> unit
 544(** Pretty-print a fragment context. *)
 545
 546(** Result of parsing an HTML document.
 547
 548    This record contains everything produced by parsing:
 549    - The DOM tree (accessible via {!val-root})
 550    - Any parse errors (accessible via {!val-errors})
 551    - The detected encoding (accessible via {!val-encoding})
 552*)
 553type t = {
 554  root : node;
 555  (** Root node of the parsed document tree.
 556
 557      For full document parsing, this is a Document node containing the
 558      DOCTYPE (if any) and [<html>] element.
 559
 560      For fragment parsing, this is a Document Fragment containing the
 561      parsed elements. *)
 562
 563  errors : parse_error list;
 564  (** Parse errors encountered during parsing.
 565
 566      This list is empty unless [~collect_errors:true] was passed to the
 567      parse function. Errors are in the order they were encountered.
 568
 569      @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 570           WHATWG: Parse errors *)
 571
 572  encoding : encoding option;
 573  (** Character encoding detected during parsing.
 574
 575      This is [Some encoding] when using {!parse_bytes} with automatic
 576      encoding detection, and [None] when using {!parse} (which expects
 577      pre-decoded UTF-8 input). *)
 578}
 579
 580val pp : Format.formatter -> t -> unit
 581(** Pretty-print a parse result summary. *)
 582
 583(** {1 Parsing Functions} *)
 584
 585(** Parse HTML from a [Bytes.Reader.t].
 586
 587    This is the primary parsing function. It reads bytes from the provided
 588    reader and returns a DOM tree. The input should be valid UTF-8.
 589
 590    {b Creating readers:}
 591    {[
 592      open Bytesrw
 593
 594      (* From a string *)
 595      let reader = Bytes.Reader.of_string html_string
 596
 597      (* From a file *)
 598      let ic = open_in "page.html" in
 599      let reader = Bytes.Reader.of_in_channel ic
 600
 601      (* From a buffer *)
 602      let reader = Bytes.Reader.of_string (Buffer.contents buf)
 603    ]}
 604
 605    {b Parsing a complete document:}
 606    {[
 607      let result = Html5rw.parse reader
 608      let doc = Html5rw.root result
 609    ]}
 610
 611    {b Parsing a fragment:}
 612    {[
 613      let ctx = Html5rw.make_fragment_context ~tag_name:"div" () in
 614      let result = Html5rw.parse ~fragment_context:ctx reader
 615    ]}
 616
 617    @param collect_errors If [true], collect parse errors. Default: [false].
 618           Error collection has some performance overhead.
 619    @param fragment_context Context element for fragment parsing. If provided,
 620           the input is parsed as a fragment (like innerHTML) rather than
 621           a complete document.
 622
 623    @see <https://html.spec.whatwg.org/multipage/parsing.html>
 624         WHATWG: HTML parsing algorithm *)
 625val parse : ?collect_errors:bool -> ?fragment_context:fragment_context ->
 626  Bytesrw.Bytes.Reader.t -> t
 627
 628(** Parse raw bytes with automatic encoding detection.
 629
 630    This function is useful when you have raw bytes and don't know the
 631    character encoding. It implements the WHATWG encoding sniffing algorithm:
 632
 633    1. {b BOM detection}: Check for UTF-8, UTF-16LE, or UTF-16BE BOM
 634    2. {b Prescan}: Look for [<meta charset="...">] in the first 1024 bytes
 635    3. {b Transport hint}: Use the provided [transport_encoding] if any
 636    4. {b Fallback}: Use UTF-8 (the modern web default)
 637
 638    The detected encoding is stored in the result's [encoding] field.
 639
 640    {b Example:}
 641    {[
 642      let bytes = Bytes.of_string (really_input_string ic (in_channel_length ic)) in
 643      let result = Html5rw.parse_bytes bytes in
 644      match Html5rw.encoding result with
 645      | Some Utf8 -> print_endline "UTF-8 detected"
 646      | Some Windows_1252 -> print_endline "Windows-1252 detected"
 647      | _ -> ()
 648    ]}
 649
 650    @param collect_errors If [true], collect parse errors. Default: [false].
 651    @param transport_encoding Encoding hint from HTTP Content-Type header.
 652           For example, if the server sends [Content-Type: text/html; charset=utf-8],
 653           pass [~transport_encoding:"utf-8"].
 654    @param fragment_context Context element for fragment parsing.
 655
 656    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
 657         WHATWG: Determining the character encoding *)
 658val parse_bytes : ?collect_errors:bool -> ?transport_encoding:string ->
 659  ?fragment_context:fragment_context -> bytes -> t
 660
 661(** {1 Querying} *)
 662
 663(** Query the DOM tree with a CSS selector.
 664
 665    CSS selectors are patterns used to select elements in HTML documents.
 666    This function returns all nodes matching the selector, in document order.
 667
 668    {b Supported selectors:}
 669
 670    {i Type selectors:}
 671    - [div], [p], [span] - elements by tag name
 672
 673    {i Class and ID selectors:}
 674    - [#myid] - element with [id="myid"]
 675    - [.myclass] - elements with class containing "myclass"
 676
 677    {i Attribute selectors:}
 678    - [[attr]] - elements with the [attr] attribute
 679    - [[attr="value"]] - attribute equals value
 680    - [[attr~="value"]] - attribute contains word
 681    - [[attr|="value"]] - attribute equals value or starts with value-
 682    - [[attr^="value"]] - attribute starts with value
 683    - [[attr$="value"]] - attribute ends with value
 684    - [[attr*="value"]] - attribute contains value
 685
 686    {i Pseudo-classes:}
 687    - [:first-child], [:last-child] - first/last child of parent
 688    - [:nth-child(n)] - nth child (1-indexed)
 689    - [:only-child] - only child of parent
 690    - [:empty] - elements with no children
 691    - [:not(selector)] - elements not matching selector
 692
 693    {i Combinators:}
 694    - [A B] - B descendants of A (any depth)
 695    - [A > B] - B direct children of A
 696    - [A + B] - B immediately after A (adjacent sibling)
 697    - [A ~ B] - B after A (general sibling)
 698
 699    {i Universal:}
 700    - [*] - all elements
 701
 702    {b Examples:}
 703    {[
 704      (* All paragraphs *)
 705      let ps = query result "p"
 706
 707      (* Elements with class "warning" inside a div *)
 708      let warnings = query result "div .warning"
 709
 710      (* Direct children of nav that are links *)
 711      let nav_links = query result "nav > a"
 712
 713      (* Complex selector *)
 714      let items = query result "ul.menu > li:first-child a[href]"
 715    ]}
 716
 717    @raise Selector.Selector_error if the selector syntax is invalid
 718
 719    @see <https://www.w3.org/TR/selectors-4/>
 720         W3C: Selectors Level 4 *)
 721val query : t -> string -> node list
 722
 723(** Check if a node matches a CSS selector.
 724
 725    This is useful for filtering nodes or implementing custom traversals.
 726
 727    {b Example:}
 728    {[
 729      let is_external_link node =
 730        matches node "a[href^='http']"
 731    ]}
 732
 733    @raise Selector.Selector_error if the selector syntax is invalid *)
 734val matches : node -> string -> bool
 735
 736(** {1 Serialization} *)
 737
 738(** Write the DOM tree to a [Bytes.Writer.t].
 739
 740    This serializes the DOM back to HTML. The output is valid HTML5 that
 741    can be parsed to produce an equivalent DOM tree.
 742
 743    {b Example:}
 744    {[
 745      open Bytesrw
 746      let buf = Buffer.create 1024 in
 747      let writer = Bytes.Writer.of_buffer buf in
 748      Html5rw.to_writer result writer;
 749      Bytes.Writer.write_eod writer;
 750      let html = Buffer.contents buf
 751    ]}
 752
 753    @param pretty If [true] (default), add indentation for readability.
 754           If [false], output compact HTML with no added whitespace.
 755    @param indent_size Spaces per indentation level (default: 2).
 756           Only used when [pretty] is [true].
 757
 758    @see <https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments>
 759         WHATWG: Serialising HTML fragments *)
 760val to_writer : ?pretty:bool -> ?indent_size:int -> t ->
 761  Bytesrw.Bytes.Writer.t -> unit
 762
 763(** Serialize the DOM tree to a string.
 764
 765    Convenience function that serializes to a string instead of a writer.
 766    Use {!to_writer} for large documents to avoid building the whole output in memory.
 767
 768    @param pretty If [true] (default), add indentation for readability.
 769    @param indent_size Spaces per indentation level (default: 2). *)
 770val to_string : ?pretty:bool -> ?indent_size:int -> t -> string
 771
 772(** Extract text content from the DOM tree.
 773
 774    This concatenates all text nodes in the document, producing a string
 775    with just the readable text (no HTML tags).
 776
 777    {b Example:}
 778    {[
 779      (* For document: <div><p>Hello</p><p>World</p></div> *)
 780      let text = to_text result
 781      (* Returns: "Hello World" *)
 782    ]}
 783
 784    @param separator String to insert between text nodes (default: [" "])
 785    @param strip If [true] (default), trim leading/trailing whitespace *)
 786val to_text : ?separator:string -> ?strip:bool -> t -> string
 787
 788(** Serialize to html5lib test format.
 789
 790    This produces the tree format used by the
 791    {{:https://github.com/html5lib/html5lib-tests} html5lib-tests} suite.
 792    Mainly useful for testing the parser against the reference tests. *)
 793val to_test_format : t -> string
 794
 795(** {1 Result Accessors} *)
 796
 797(** [root t] is the root node of the parsed document.
 798
 799    For full document parsing, this is a Document node, structured as:
 800    {v
 801    #document
 802    ├── !doctype (if present)
 803    └── html
 804        ├── head
 805        └── body
 806    v}
 807
 808    For fragment parsing, this is a Document Fragment node that directly
 809    contains the parsed elements. *)
 810val root : t -> node
 811
 812(** [errors t] is the list of parse errors (if error collection was enabled).
 813
 814    The list is empty if [~collect_errors:true] was not passed to the
 815    parse function, or if no parse errors were encountered.
 816
 817    Errors are listed in the order they were encountered during parsing.
 818
 819    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
 820         WHATWG: Parse errors *)
 821val errors : t -> parse_error list
 822
 823(** [encoding t] is the detected encoding (if parsed from bytes).
 824
 825    It is [Some encoding] when {!parse_bytes} was used, naming the encoding
 826    that was detected or specified. It is [None] when {!parse} was
 827    used, since that function expects pre-decoded UTF-8 input.
 828
 829    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
 830         WHATWG: Determining the character encoding *)
 831val encoding : t -> encoding option
 832
 833(** {1 DOM Utilities}
 834
 835    Common DOM operations are available directly on this module. For the
 836    full API including more advanced operations, see the {!Dom} module.
 837
 838    @see <https://html.spec.whatwg.org/multipage/dom.html>
 839         WHATWG: Semantics, structure, and APIs of HTML documents
 840*)
 841
 842(** [create_element name ?namespace ?attrs ()] creates an element node.
 843
 844    Elements are the building blocks of HTML documents. They represent tags
 845    like [<div>], [<p>], [<a>], etc.
 846
 847    @param name Tag name (e.g., ["div"], ["p"], ["span"])
 848    @param namespace Element namespace, itself an option (e.g. [~namespace:(Some "svg")]):
 849           - [None] (default): HTML namespace
 850           - [Some "svg"]: SVG namespace for graphics
 851           - [Some "mathml"]: MathML namespace for math notation
 852    @param attrs Initial attributes as [(name, value)] pairs
 853
 854    {b Example:}
 855    {[
 856      (* Simple element; the trailing [()] is required *)
 857      let div = create_element "div" ()
 858
 859      (* Element with attributes *)
 860      let link = create_element "a"
 861        ~attrs:[("href", "/about"); ("class", "nav-link")]
 862        ()
 863    ]}
 864
 865    @see <https://html.spec.whatwg.org/multipage/dom.html#elements-in-the-dom>
 866         WHATWG: Elements in the DOM *)
 867val create_element : string -> ?namespace:string option ->
 868  ?attrs:(string * string) list -> unit -> node
 869
 870(** [create_text text] creates a text node.
 871
 872    Text nodes hold the readable text content of HTML documents.
 873
 874    {b Example:}
 875    {[
 876      let text = create_text "Hello, world!"
 877    ]} *)
 878val create_text : string -> node
 879
 880(** [create_comment text] creates a comment node.
 881
 882    Comments are kept in the DOM but not rendered. In HTML they are
 883    written as [<!-- text -->].
 884
 885    @see <https://html.spec.whatwg.org/multipage/syntax.html#comments>
 886         WHATWG: Comments *)
 887val create_comment : string -> node
 888
 889(** [create_document ()] creates an empty document node.
 890
 891    A Document node is the root of an HTML document tree.
 892
 893    @see <https://html.spec.whatwg.org/multipage/dom.html#document>
 894         WHATWG: The Document object *)
 895val create_document : unit -> node
 896
 897(** [create_document_fragment ()] creates a document fragment node.
 898
 899    Document fragments are lightweight containers that hold nodes
 900    without a parent document. They are used for template contents and
 901    for fragment parsing results.
 902
 903    @see <https://dom.spec.whatwg.org/#documentfragment>
 904         DOM Standard: DocumentFragment *)
 905val create_document_fragment : unit -> node
 906
 907(** [create_doctype ?name ?public_id ?system_id ()] creates a doctype node.
 908
 909    For HTML5 documents, use [create_doctype ~name:"html" ()].
 910
 911    @param name DOCTYPE name (usually ["html"])
 912    @param public_id Public identifier (legacy)
 913    @param system_id System identifier (legacy)
 914
 915    @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
 916         WHATWG: The DOCTYPE *)
 917val create_doctype : ?name:string -> ?public_id:string ->
 918  ?system_id:string -> unit -> node
 919
 920(** Append a child node to a parent node.
 921
 922    The child becomes the last child of the parent. If the child
 923    already has a parent, it is first removed from that parent. *)
 924val append_child : node -> node -> unit
 925
 926(** Insert [new_child] into [parent] before the reference node [ref_child].
 927
 928    @param parent The parent node
 929    @param new_child The node to insert
 930    @param ref_child The existing child to insert before
 931
 932    @raise Not_found if [ref_child] is not a child of [parent]. *)
 933val insert_before : node -> node -> node -> unit
 934
 935(** Remove the node [child] from its parent node [parent].
 936
 937    @raise Not_found if [child] is not a child of [parent]. *)
 938val remove_child : node -> node -> unit
 939
 940(** [get_attr node name] gets the value of the attribute [name] of [node].
 941
 942    Returns [Some value] if the attribute exists, [None] otherwise.
 943    Attribute names are case-sensitive (but were lowercased during parsing).
 944
 945    @see <https://html.spec.whatwg.org/multipage/dom.html#attributes>
 946         WHATWG: Attributes *)
 947val get_attr : node -> string -> string option
 948
 949(** Set the value of an attribute on a node.
 950
 951    If the attribute exists, its value is replaced; otherwise it is added. *)
 952val set_attr : node -> string -> string -> unit
 953
 954(** [has_attr node name] checks whether [node] has an attribute [name]. *)
 955val has_attr : node -> string -> bool
 956
 957(** [descendants node] is the list of all descendant nodes in document order.
 958
 959    This covers every node below [node] in the tree, in the order the nodes
 960    appear in the HTML source (depth-first). *)
 961val descendants : node -> node list
 962
 963(** [ancestors node] is the list of all ancestor nodes, from parent to root.
 964
 965    This is the chain of parent nodes, starting with the immediate parent
 966    and ending with the Document node. *)
 967val ancestors : node -> node list
 968
 969(** [get_text_content node] is the text content of [node] and its descendants.
 970
 971    For a text node, this is the text itself. For an element, it is the
 972    recursive concatenation of all descendant text content. *)
 973val get_text_content : node -> string
 974
 975(** [clone ?deep node] creates a copy of [node].
 976
 977    @param deep If [true], recursively clone all descendants.
 978           If [false] (default), only clone the node itself. *)
 979val clone : ?deep:bool -> node -> node
 980
 981(** {1 Node Predicates}
 982
 983    Functions to test what type of node you have.
 984*)
 985
 986(** [is_element node] tests whether [node] is an element.
 987
 988    Elements are HTML tags like [<div>], [<p>], [<a>]. *)
 989val is_element : node -> bool
 990
 991(** [is_text node] tests whether [node] is a text node.
 992
 993    Text nodes contain character content within elements. *)
 994val is_text : node -> bool
 995
 996(** [is_comment node] tests whether [node] is a comment node.
 997
 998    Comment nodes represent HTML comments [<!-- ... -->]. *)
 999val is_comment : node -> bool
1000
1001(** [is_document node] tests whether [node] is a document node.
1002
1003    The document node is the root of a complete HTML document tree. *)
1004val is_document : node -> bool
1005
1006(** [is_document_fragment node] tests whether [node] is a document fragment.
1007
1008    Document fragments are lightweight containers for nodes. *)
1009val is_document_fragment : node -> bool
1010
1011(** [is_doctype node] tests whether [node] is a doctype node.
1012
1013    Doctype nodes represent the [<!DOCTYPE>] declaration. *)
1014val is_doctype : node -> bool
1015
1016(** [has_children node] tests whether [node] has children. *)
1017val has_children : node -> bool