(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy . All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Html5rw - Pure OCaml HTML5 Parser

    This library provides a complete HTML5 parsing solution that implements the
    {{:https://html.spec.whatwg.org/multipage/parsing.html} WHATWG HTML5
    parsing specification}. It can parse any HTML document - well-formed or
    not - and produce a DOM (Document Object Model) tree that matches browser
    behavior.

    {2 What is HTML?}

    HTML (HyperText Markup Language) is the standard markup language for
    creating web pages. An HTML document consists of nested {i elements} that
    describe the structure and content of the page:

    {v
    <html>
      <head>
        <title>My Page</title>
      </head>
      <body>
        <h1>Welcome</h1>
        <p>Hello, <b>world</b>!</p>
      </body>
    </html>
    v}

    Each element is written with a {i start tag} (like [<p>]), content, and
    an {i end tag} (like [</p>
]). Elements can have {i attributes} that provide additional information:
    [<a href="https://example.com">].

    @see <https://html.spec.whatwg.org/multipage/introduction.html>
      WHATWG: Introduction to HTML

    {2 The DOM}

    When this parser processes HTML, it doesn't just store the text. Instead,
    it builds a tree structure called the DOM (Document Object Model). Each
    element, text fragment, and comment becomes a {i node} in this tree:

    {v
    Document
    └── html
        ├── head
        │   └── title
        │       └── #text "My Page"
        └── body
            ├── h1
            │   └── #text "Welcome"
            └── p
                ├── #text "Hello, "
                ├── b
                │   └── #text "world"
                └── #text "!"
    v}

    This tree can be traversed, searched, and modified. The {!Dom} module
    provides types and functions for working with DOM nodes.

    @see <https://html.spec.whatwg.org/multipage/dom.html>
      WHATWG: The elements of HTML (DOM chapter)

    {2 Quick Start}

    Parse HTML from a string:
    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
      let result = Html5rw.parse reader in
      let html = Html5rw.to_string result
    ]}

    Parse from a file:
    {[
      open Bytesrw
      let ic = open_in "page.html" in
      let reader = Bytes.Reader.of_in_channel ic in
      let result = Html5rw.parse reader in
      close_in ic
    ]}

    Query with CSS selectors:
    {[
      let result = Html5rw.parse reader in
      let divs = Html5rw.query result "div.content"
    ]}

    {2 Error Handling}

    Unlike many parsers, HTML5 parsing {b never fails}. The WHATWG
    specification defines error recovery rules for every possible malformed
    input, ensuring all HTML documents produce a valid DOM tree (just as
    browsers do).

    For example, parsing [<p>Hello<p>World] produces two paragraphs, not an
    error, because [<p>] implicitly closes the previous [<p>
]. If you need to detect malformed HTML (e.g., for validation), enable error
    collection with [~collect_errors:true]. Errors are advisory - the parsing
    still succeeds.

    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
      WHATWG: Parse errors

    {2 HTML vs XHTML}

    This parser implements {b HTML5 parsing}, not XHTML parsing. Key
    differences:

    - Tag and attribute names are case-insensitive ([<DIV>] equals [<div>])
    - Some end tags are optional ([<p>Hello] is valid)
    - Void elements have no end tag ([<br>], not [<br/>] or [<br></br>
])
    - Boolean attributes need no value ([<input disabled>])

    XHTML uses stricter XML rules. If you need XHTML parsing, use an XML
    parser.

    @see <https://html.spec.whatwg.org/multipage/syntax.html>
      WHATWG: The HTML syntax
*)

(** {1 Sub-modules} *)

(** Parse error code types.

    This module provides the {!Parse_error_code.t} variant type that represents
    all WHATWG-defined parse errors plus tree construction errors.

    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
      WHATWG: Parse errors
*)
module Parse_error_code = Parse_error_code

(** DOM types and manipulation functions.

    This module provides the core types for representing HTML documents as
    DOM trees. It includes:
    - The {!Dom.node} type representing all kinds of DOM nodes
    - Functions to create, modify, and traverse nodes
    - Serialization functions to convert DOM back to HTML

    @see <https://html.spec.whatwg.org/multipage/dom.html>
      WHATWG: The elements of HTML
*)
module Dom = Dom

(** HTML5 tokenizer.

    The tokenizer is the first stage of HTML5 parsing. It converts a stream
    of characters into a stream of {i tokens}: start tags, end tags, text,
    comments, and DOCTYPEs.

    Most users don't need to use the tokenizer directly - the {!parse}
    function handles everything. The tokenizer is exposed for advanced use
    cases like syntax highlighting or partial parsing.

    @see <https://html.spec.whatwg.org/multipage/parsing.html#tokenization>
      WHATWG: Tokenization
*)
module Tokenizer = Tokenizer

(** Encoding detection and decoding.

    HTML documents can use various character encodings (UTF-8, ISO-8859-1,
    etc.). This module implements the WHATWG encoding sniffing algorithm that
    browsers use to detect the encoding of a document:

    1. Check for a BOM (Byte Order Mark)
    2. Look for a [<meta charset>] declaration
    3. Use HTTP Content-Type header hint (if available)
    4. Fall back to UTF-8

    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
      WHATWG: Determining the character encoding
    @see <https://encoding.spec.whatwg.org/> WHATWG Encoding Standard
*)
module Encoding = Encoding

(** CSS selector engine.

    This module provides CSS selector support for querying the DOM tree.
    CSS selectors are patterns used to select HTML elements based on their
    tag names, attributes, classes, IDs, and position in the document.

    Example selectors:
    - [div] - all [<div>] elements
    - [#header] - element with [id="header"]
    - [.warning] - elements with [class="warning"]
    - [div > p] - [<p>] elements that are direct children of [<div>
]
    - [[href]] - elements with an [href] attribute

    @see <https://www.w3.org/TR/selectors-4/>
      W3C Selectors Level 4 specification
*)
module Selector = Selector

(** HTML entity decoding.

    HTML uses {i character references} to represent characters that are hard
    to type or have special meaning:

    - Named references: [&amp;] (ampersand), [&lt;] (less than), [&nbsp;]
      (non-breaking space)
    - Decimal references: [&#60;] (less than as decimal 60)
    - Hexadecimal references: [&#x3C;] (less than as hex 3C)

    This module decodes all 2,231 named character references defined in the
    WHATWG specification, plus numeric references.

    @see <https://html.spec.whatwg.org/multipage/named-characters.html>
      WHATWG: Named character references
*)
module Entities = Entities

(** Low-level parser access.

    This module exposes the internals of the HTML5 parser for advanced use.
    Most users should use the top-level {!parse} function instead.

    The parser exposes:
    - Insertion modes for the tree construction algorithm
    - The tree builder state machine
    - Lower-level parsing functions

    @see <https://html.spec.whatwg.org/multipage/parsing.html#tree-construction>
      WHATWG: Tree construction
*)
module Parser = Parser

(** {1 Core Types} *)

(** DOM node type.

    A node represents one part of an HTML document. Nodes form a tree
    structure with parent/child relationships. There are several kinds:

    - {b Element nodes}: HTML tags like [<div>], [<p>
], [<a>]
    - {b Text nodes}: Text content within elements
    - {b Comment nodes}: HTML comments [<!-- comment -->]
    - {b Document nodes}: The root of a document tree
    - {b Document fragment nodes}: Lightweight containers
    - {b Doctype nodes}: The [<!DOCTYPE>] declaration

    See {!Dom} for manipulation functions.

    @see <https://html.spec.whatwg.org/multipage/dom.html> WHATWG: The DOM
*)
type node = Dom.node

val pp_node : Format.formatter -> node -> unit
(** Pretty-print a DOM node. Prints a summary representation showing the
    node type and key attributes. Does not recursively print children. *)

(** DOCTYPE information.

    The DOCTYPE declaration ([<!DOCTYPE html>]) appears at the start of HTML
    documents. It tells browsers to use standards mode for rendering.

    In HTML5, the DOCTYPE is minimal - just [<!DOCTYPE html>] with no public
    or system identifiers. Legacy DOCTYPEs may have additional fields.

    @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
      WHATWG: The DOCTYPE
*)
type doctype_data = Dom.doctype_data = {
  name : string option;
  (** DOCTYPE name, typically ["html"] *)
  public_id : string option;
  (** Public identifier for legacy DOCTYPEs (e.g., XHTML, HTML4) *)
  system_id : string option;
  (** System identifier (URL) for legacy DOCTYPEs *)
}

val pp_doctype_data : Format.formatter -> doctype_data -> unit
(** Pretty-print DOCTYPE data. *)

(** Source location for nodes.

    Records the line and column where a node was found in the source HTML.
    The end position is optional for nodes like text that may span multiple
    locations. *)
type location = Dom.location = {
  line : int;
  (** 1-indexed line number where the node starts *)
  column : int;
  (** 1-indexed column number where the node starts *)
  end_line : int option;
  (** Optional line number where the node ends *)
  end_column : int option;
  (** Optional column number where the node ends *)
}

val make_location :
  line:int -> column:int -> ?end_line:int -> ?end_column:int -> unit -> location
(** Create a location. *)

val get_location : node -> location option
(** Get the source location for a node, if set.
*)

val set_location :
  node -> line:int -> column:int -> ?end_line:int -> ?end_column:int -> unit -> unit
(** Set the source location for a node. *)

(** Quirks mode as determined during parsing.

    {i Quirks mode} controls how browsers render CSS and compute layouts.
    It exists for backwards compatibility with old web pages that relied on
    browser bugs.

    - {b No_quirks}: Standards mode. The document is rendered according to
      modern HTML5 and CSS specifications. Triggered by [<!DOCTYPE html>].
    - {b Quirks}: Full quirks mode. The browser emulates bugs from older
      browsers (primarily IE5). Triggered by missing or malformed DOCTYPEs.
      Affects CSS box model, table layout, font inheritance, and more.
    - {b Limited_quirks}: Almost standards mode. Only a few specific quirks
      are applied, mainly affecting table cell vertical alignment.

    {b Recommendation:} Always use [<!DOCTYPE html>] to ensure standards mode.

    @see <https://quirks.spec.whatwg.org/> Quirks Mode Standard
    @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
      WHATWG: How quirks mode is determined
*)
type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks

val pp_quirks_mode : Format.formatter -> quirks_mode -> unit
(** Pretty-print quirks mode. *)

(** Character encoding detected or specified.

    HTML documents are sequences of bytes that must be decoded into
    characters. Different encodings interpret the same bytes differently.
    For example:

    - UTF-8: The modern standard, supporting all Unicode characters
    - Windows-1252: Common on older Western European web pages
    - ISO-8859-2: Used for Central European languages
    - UTF-16: Used by some Windows applications

    The parser detects encoding automatically when using {!parse_bytes}.
    The detected encoding is available via {!val-encoding}.
@see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
      WHATWG: Determining the character encoding
    @see <https://encoding.spec.whatwg.org/> WHATWG Encoding Standard
*)
type encoding = Encoding.encoding =
  | Utf8
  (** UTF-8: The dominant encoding for the web, supporting all Unicode *)
  | Utf16le
  (** UTF-16 Little-Endian: 16-bit encoding, used by Windows *)
  | Utf16be
  (** UTF-16 Big-Endian: 16-bit encoding, network byte order *)
  | Windows_1252
  (** Windows-1252 (CP-1252): Western European, superset of ISO-8859-1 *)
  | Iso_8859_2
  (** ISO-8859-2: Central European (Polish, Czech, Hungarian, etc.) *)
  | Euc_jp
  (** EUC-JP: Extended Unix Code for Japanese *)

val pp_encoding : Format.formatter -> encoding -> unit
(** Pretty-print an encoding using its canonical label. *)

(** A parse error encountered during HTML5 parsing.

    HTML5 parsing {b never fails} - the specification defines error recovery
    for all malformed input. However, conformance checkers can report these
    errors. Enable error collection with [~collect_errors:true] if you want
    to detect malformed HTML.

    {b Common parse errors:}

    - ["unexpected-null-character"]: Null byte in the input
    - ["eof-before-tag-name"]: File ended while reading a tag
    - ["unexpected-character-in-attribute-name"]: Invalid attribute syntax
    - ["missing-doctype"]: Document started without [<!DOCTYPE>]
    - ["duplicate-attribute"]: Same attribute appears twice on an element

    The full list of parse error codes is defined in the WHATWG specification.

    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
      WHATWG: Complete list of parse errors
*)
type parse_error = Parser.parse_error

(** Get the error code.

    Returns the {!Parse_error_code.t} variant representing this error.
    This allows pattern matching on specific error types:

    {[
      match Html5rw.error_code err with
      | Parse_error_code.Unexpected_null_character -> (* handle *)
      | Parse_error_code.Eof_in_tag -> (* handle *)
      | Parse_error_code.Tree_construction_error msg -> (* handle tree error *)
      | _ -> (* other *)
    ]}

    Use {!Parse_error_code.to_string} to convert to a string representation.
@see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
      WHATWG: Parse error codes
*)
val error_code : parse_error -> Parse_error_code.t

(** Get the line number where the error occurred (1-indexed).

    Line numbers count from 1 and increment at each newline character. *)
val error_line : parse_error -> int

(** Get the column number where the error occurred (1-indexed).

    Column numbers count from 1 and reset at each newline. *)
val error_column : parse_error -> int

val pp_parse_error : Format.formatter -> parse_error -> unit
(** Pretty-print a parse error with location information. *)

(** {1 Error Handling} *)

(** Global error type that wraps all errors raised by the Html5rw library.

    This module provides a unified error type for all parsing and selector
    errors, along with printers and conversion functions.

    Use this when you want to handle all possible errors from the library
    in a uniform way.

    {2 Usage}

    {[
      (* Converting parse errors *)
      let errors = Html5rw.errors result in
      List.iter (fun err ->
        let unified = Html5rw.Error.of_parse_error err in
        Printf.eprintf "%s\n" (Html5rw.Error.to_string unified)
      ) errors

      (* Catching selector errors *)
      match Html5rw.query result selector with
      | nodes -> (* success *)
      | exception Html5rw.Selector.Selector_error code ->
          let unified = Html5rw.Error.of_selector_error code in
          Printf.eprintf "%s\n" (Html5rw.Error.to_string unified)
    ]}
*)
module Error : sig
  (** The unified error type for the Html5rw library. *)
  type t =
    | Parse_error of { code : Parse_error_code.t; line : int; column : int; }
    (** An HTML parse error, including location information.

        Parse errors occur during HTML tokenization and tree construction.
        The location indicates where in the input the error was detected.

        @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
          WHATWG: Parse errors *)
    | Selector_error of Selector.Error_code.t
    (** A CSS selector parse error.

        Selector errors occur when parsing malformed CSS selectors passed to
        {!query} or {!matches}. *)

  val of_parse_error : parse_error -> t
  (** Convert a parse error to the unified error type.

      {[
        let errors = Html5rw.errors result in
        let unified_errors = List.map Html5rw.Error.of_parse_error errors
      ]} *)

  val of_selector_error : Selector.Error_code.t -> t
  (** Convert a selector error code to the unified error type.

      {[
        match Html5rw.query result "invalid[" with
        | _ -> ()
        | exception Html5rw.Selector.Selector_error code ->
            let err = Html5rw.Error.of_selector_error code in
            Printf.eprintf "%s\n" (Html5rw.Error.to_string err)
      ]} *)

  val to_string : t -> string
  (** Convert to a human-readable error message with location information.

      Examples:
      - ["Parse error at 5:12: unexpected-null-character"]
      - ["Selector error: Expected \]"] *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-printer for use with [Format] functions. *)

  val code_string : t -> string
  (** Get just the error code as a kebab-case string (without location).

      This is useful for programmatic error handling or logging.

      Examples:
      - ["unexpected-null-character"]
      - ["expected-closing-bracket"] *)
end

(** {1 Fragment Parsing} *)

(** Context element for HTML fragment parsing (innerHTML).

    When parsing HTML fragments (like the [innerHTML] of an element), you
    must specify what element would contain the fragment. This affects how
    the parser handles certain elements.

    {b Why context matters:}

    HTML parsing rules depend on where content appears. For example:
    - [] is valid inside [] but not inside [

] - [
  • ] is valid inside [