(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy . All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Html5rw - Pure OCaml HTML5 Parser

    A complete HTML5 parsing solution that follows the WHATWG specification
    and uses bytesrw for streaming input/output.

    {2 Quick Start}

    Parse HTML from a reader:
    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
      let result = Html5rw.parse reader in
      let html = Html5rw.to_string result
    ]}

    Parse from a file:
    {[
      open Bytesrw
      let ic = open_in "page.html" in
      let reader = Bytes.Reader.of_in_channel ic in
      let result = Html5rw.parse reader in
      close_in ic
    ]}

    Query with CSS selectors:
    {[
      let result = Html5rw.parse reader in
      let divs = Html5rw.query result "div.content"
    ]} *)

(** {1 Error Handling} *)

(** Library-wide error type for Html5rw.

    Gathers HTML parse errors and CSS selector errors under one variant and
    supplies printers for display and debugging. *)
module Error = struct
  (** An error reported by the Html5rw library. *)
  type t =
    | Parse_error of {
        code : Parse_error_code.t;
        line : int;
        column : int;
      }
        (** An HTML parse error, with its location in the input. *)
    | Selector_error of Selector.Error_code.t
        (** A CSS selector parse error. *)

  (** [of_parse_error err] converts a parser error into {!t}, reading its
      code, line and column through the [Parser] accessors. *)
  let of_parse_error (err : Parser.parse_error) : t =
    let code = Parser.error_code err in
    let line = Parser.error_line err in
    let column = Parser.error_column err in
    Parse_error { code; line; column }

  (** [of_selector_error code] wraps a selector error code into {!t}. *)
  let of_selector_error (code : Selector.Error_code.t) : t =
    Selector_error code

  (** [to_string err] renders [err] as a one-line message: the position and
      code for parse errors, the human-readable description for selector
      errors. *)
  let to_string err =
    match err with
    | Selector_error code ->
        Printf.sprintf "Selector error: %s"
          (Selector.Error_code.to_human_string code)
    | Parse_error { code; line; column } ->
        Printf.sprintf "Parse error at %d:%d: %s" line column
          (Parse_error_code.to_string code)

  (** [pp ppf err] prints the {!to_string} rendering of [err] on [ppf]. *)
  let pp ppf err = Format.pp_print_string ppf (to_string err)

  (** [code_string err] is the error code of [err] as a kebab-case string. *)
  let code_string err =
    match err with
    | Selector_error code -> Selector.Error_code.to_string code
    | Parse_error { code; _ } -> Parse_error_code.to_string code
end

(** {1 Sub-modules} *)

(** Parse error code types *)
module Parse_error_code = Parse_error_code

(** DOM types and manipulation functions *)
module Dom = Dom

(** HTML5 tokenizer *)
module Tokenizer = Tokenizer

(** Encoding detection and decoding *)
module Encoding = Encoding

(** CSS selector engine *)
module Selector = Selector

(** HTML entity decoding *)
module Entities = Entities

(** Low-level parser access *)
module Parser = Parser

(** {1 Core Types} *)

(** The DOM node type. Manipulation functions live in {!Dom}. *)
type node = Dom.node

(** Printer for {!node}; re-export of [Dom.pp]. *)
let pp_node = Dom.pp

(** Doctype information *)
type doctype_data = Dom.doctype_data = {
  name : string option;
  public_id : string option;
  system_id : string option;
}

(** Re-export of [Dom.pp_doctype_data]. *)
let pp_doctype_data = Dom.pp_doctype_data

(** Source location attached to nodes *)
type location = Dom.location = {
  line : int;
  column : int;
  end_line : int option;
  end_column : int option;
}

(** Re-export of [Dom.make_location]. *)
let make_location = Dom.make_location

(** Re-export of [Dom.get_location]. *)
let get_location = Dom.get_location

(** Re-export of [Dom.set_location]. *)
let set_location = Dom.set_location

(** Quirks mode as determined during parsing *)
type quirks_mode = Dom.quirks_mode =
  | No_quirks
  | Quirks
  | Limited_quirks

(** Re-export of [Dom.pp_quirks_mode]. *)
let pp_quirks_mode = Dom.pp_quirks_mode

(** Character encoding, either detected or specified *)
type encoding = Encoding.encoding =
  | Utf8
  | Utf16le
  | Utf16be
  | Windows_1252
  | Iso_8859_2
  | Euc_jp

(** Printer for {!encoding}; re-export of [Encoding.pp]. *)
let pp_encoding = Encoding.pp

(** Parse error record *)
type parse_error = Parser.parse_error

(** Fragment parsing context *)
type fragment_context = Parser.fragment_context

(** Build a fragment parsing context.
    @param tag_name Tag name of the context element
    @param namespace Namespace (None for HTML, Some "svg", Some "mathml") *)
let make_fragment_context = Parser.make_fragment_context

(** Tag name stored in a fragment context *)
let fragment_context_tag = Parser.fragment_context_tag

(** Namespace stored in a fragment context *)
let fragment_context_namespace = Parser.fragment_context_namespace

(** Re-export of [Parser.pp_fragment_context]. *)
let pp_fragment_context = Parser.pp_fragment_context

(** Error code of a parse error *)
let error_code = Parser.error_code

(** Line number of a parse error (1-indexed) *)
let error_line = Parser.error_line

(** Column number of a parse error (1-indexed) *)
let error_column = Parser.error_column

(** Re-export of [Parser.pp_parse_error]. *)
let pp_parse_error = Parser.pp_parse_error

(** Outcome of parsing an HTML document: the root node, the parse errors
    that were recorded, and the encoding when one is known. *)
type t = {
  root : node;
  errors : parse_error list;
  encoding : encoding option;
}

(** [pp ppf t] prints a summary of [t]: its root node, the number of
    recorded errors, and the optional encoding. *)
let pp ppf { root; errors; encoding } =
  Format.fprintf ppf "{root=%a; errors=%d; encoding=%a}"
    pp_node root
    (List.length errors)
    (Format.pp_print_option pp_encoding) encoding

(* Internal: repackage a [Parser.t] as this module's [t]. *)
let of_parser_result (p : Parser.t) : t =
  let root = Parser.root p in
  let errors = Parser.errors p in
  let encoding = Parser.encoding p in
  { root; errors; encoding }

(** {1 Parsing Functions} *)

(** Parse HTML from a [Bytes.Reader.t].

    This is the primary parsing function. A reader can be created from any
    source:
    - [Bytes.Reader.of_string s] for strings
    - [Bytes.Reader.of_in_channel ic] for files
    - [Bytes.Reader.of_bytes b] for byte buffers

    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<p>Hello</p>" in
      let result = Html5rw.parse reader
    ]}

    @param collect_errors If true, collect parse errors (default: false)
    @param fragment_context Context element for fragment parsing *)
let parse ?collect_errors ?fragment_context reader =
  Parser.parse ?collect_errors ?fragment_context reader |> of_parser_result

(** Parse raw bytes with automatic encoding detection.

    This function implements the WHATWG encoding sniffing algorithm:
    1. Check for BOM (Byte Order Mark)
    2. Prescan for a [<meta charset>] declaration
    3. Fall back to UTF-8

    @param collect_errors If true, collect parse errors (default: false)
    @param transport_encoding Encoding from HTTP Content-Type header
    @param fragment_context Context element for fragment parsing *)
let parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes =
  Parser.parse_bytes ?collect_errors ?transport_encoding ?fragment_context
    bytes
  |> of_parser_result

(** {1 Querying} *)

(** Run a CSS selector against the root of a parse result.

    Supported selectors:
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]
    - Attribute: [[attr]], [[attr="value"]], [[attr~="value"]],
      [[attr|="value"]]
    - Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
    - Combinators: descendant (space), child (>), adjacent sibling (+),
      general sibling (~)

    {[
      let divs = Html5rw.query result "div.content > p"
    ]}

    @raise Selector.Selector_error if the selector is invalid *)
let query { root; _ } selector = Selector.query root selector

(** Test whether a node matches a CSS selector. *)
let matches node selector = Selector.matches node selector

(** {1 Serialization} *)

(** Write the DOM tree of a parse result to a [Bytes.Writer.t].

    {[
      open Bytesrw
      let buf = Buffer.create 1024 in
      let writer = Bytes.Writer.of_buffer buf in
      Html5rw.to_writer result writer;
      Bytes.Writer.write_eod writer;
      let html = Buffer.contents buf
    ]}

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2) *)
let to_writer ?pretty ?indent_size { root; _ } writer =
  Dom.to_writer ?pretty ?indent_size writer root

(** Serialize the DOM tree to a string.

    A convenience for output that fits in memory.

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2) *)
let to_string ?pretty ?indent_size { root; _ } =
  Dom.to_html ?pretty ?indent_size root

(** Extract the text content of the DOM tree.
    @param separator String to insert between text nodes (default: " ")
    @param strip If true, trim whitespace (default: true) *)
let to_text ?separator ?strip { root; _ } = Dom.to_text ?separator ?strip root

(** Serialize to the html5lib test format (for testing). *)
let to_test_format { root; _ } = Dom.to_test_format root

(** {1 Result Accessors} *)

(** Root node of the parsed document. *)
let root { root; _ } = root

(** Parse errors (if error collection was enabled). *)
let errors { errors; _ } = errors

(** Detected encoding (if parsed from bytes). *)
let encoding { encoding; _ } = encoding

(** {1 DOM Utilities}

    Common DOM operations are available directly. For the full API, see the
    {!Dom} module. *)

(** Create an element node.
    @param namespace None for HTML, Some "svg" or Some "mathml" for foreign
      content
    @param attrs List of (name, value) attribute pairs *)
let create_element = Dom.create_element

(** Create a text node. *)
let create_text = Dom.create_text

(** Create a comment node. *)
let create_comment = Dom.create_comment

(** Create an empty document node. *)
let create_document = Dom.create_document

(** Create a document fragment node. *)
let create_document_fragment = Dom.create_document_fragment

(** Create a doctype node. *)
let create_doctype = Dom.create_doctype

(** Append a child node to a parent. *)
let append_child = Dom.append_child

(** Insert a node before a reference node. *)
let insert_before = Dom.insert_before

(** Remove a child node from its parent. *)
let remove_child = Dom.remove_child

(** Get an attribute value. *)
let get_attr = Dom.get_attr

(** Set an attribute value. *)
let set_attr = Dom.set_attr

(** Check if a node has an attribute. *)
let has_attr = Dom.has_attr

(** Get all descendant nodes. *)
let descendants = Dom.descendants

(** Get all ancestor nodes (from parent to root). *)
let ancestors = Dom.ancestors

(** Get text content of a node and its descendants. *)
let get_text_content = Dom.get_text_content

(** Clone a node.
    @param deep If true, also clone descendants (default: false) *)
let clone = Dom.clone

(** {1 Node Predicates} *)

(** Test if a node is an element. *)
let is_element = Dom.is_element

(** Test if a node is a text node. *)
let is_text = Dom.is_text

(** Test if a node is a comment node. *)
let is_comment = Dom.is_comment

(** Test if a node is a document node. *)
let is_document = Dom.is_document

(** Test if a node is a document fragment. *)
let is_document_fragment = Dom.is_document_fragment

(** Test if a node is a doctype node. *)
let is_doctype = Dom.is_doctype

(** Test if a node has children. *)
let has_children = Dom.has_children