OCaml HTML5 parser/serialiser based on Python's JustHTML
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Html5rw - Pure OCaml HTML5 Parser

    This module provides a complete HTML5 parsing solution following the
    WHATWG specification. It uses bytesrw for streaming input/output.

    {2 Quick Start}

    Parse HTML from a reader:
    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
      let result = Html5rw.parse reader in
      let html = Html5rw.to_string result
    ]}

    Parse from a file (the channel is closed even if parsing raises):
    {[
      open Bytesrw
      let ic = open_in_bin "page.html" in
      let result =
        Fun.protect
          ~finally:(fun () -> close_in ic)
          (fun () -> Html5rw.parse (Bytes.Reader.of_in_channel ic))
    ]}

    Query with CSS selectors:
    {[
      let result = Html5rw.parse reader in
      let divs = Html5rw.query result "div.content"
    ]}
*)

(** {1 Error Handling} *)

(** Global error type that wraps all errors raised by the Html5rw library.

    This provides a unified error type for all parsing and selector errors,
    along with printers for display and debugging.
*)
module Error = struct
  (** The unified error type for the Html5rw library. *)
  type t =
    | Parse_error of {
        code : Parse_error_code.t;
        line : int;
        column : int;
      }
      (** An HTML parse error, including location information. *)
    | Selector_error of Selector.Error_code.t
      (** A CSS selector parse error. *)

  (** Wrap a parser error, copying its code, line and column as reported by
      [Parser.error_code], [Parser.error_line] and [Parser.error_column]. *)
  let of_parse_error (err : Parser.parse_error) : t =
    Parse_error {
      code = Parser.error_code err;
      line = Parser.error_line err;
      column = Parser.error_column err;
    }

  (** Wrap a selector error code. *)
  let of_selector_error (code : Selector.Error_code.t) : t =
    Selector_error code

  (** Render an error as a one-line message.

      Parse errors are formatted as ["Parse error at LINE:COLUMN: CODE"] and
      selector errors as ["Selector error: MESSAGE"], where [MESSAGE] comes
      from [Selector.Error_code.to_human_string]. *)
  let to_string = function
    | Parse_error { code; line; column } ->
      Printf.sprintf "Parse error at %d:%d: %s" line column
        (Parse_error_code.to_string code)
    | Selector_error code ->
      Printf.sprintf "Selector error: %s"
        (Selector.Error_code.to_human_string code)

  (** Formatter-based printer; prints exactly what {!to_string} returns. *)
  let pp fmt err = Format.pp_print_string fmt (to_string err)

  (** Get the error code as a kebab-case string. *)
  let code_string = function
    | Parse_error { code; _ } -> Parse_error_code.to_string code
    | Selector_error code -> Selector.Error_code.to_string code
end

(** {1 Sub-modules} *)

(** Parse error code types *)
module Parse_error_code = Parse_error_code

(** DOM types and manipulation functions *)
module Dom = Dom

(** HTML5 tokenizer *)
module Tokenizer = Tokenizer

(** Encoding detection and decoding *)
module Encoding = Encoding

(** CSS selector engine *)
module Selector = Selector

(** HTML entity decoding *)
module Entities = Entities

(** Low-level parser access *)
module Parser = Parser

(** {1 Core Types} *)

(** DOM node type. See {!Dom} for manipulation functions. *)
type node = Dom.node

(** Printer for nodes (alias of [Dom.pp]). *)
let pp_node = Dom.pp

(** Doctype information *)
type doctype_data = Dom.doctype_data = {
  name : string option;
  public_id : string option;
  system_id : string option;
}

(** Printer for doctype data (alias of [Dom.pp_doctype_data]). *)
let pp_doctype_data = Dom.pp_doctype_data

(** Quirks mode as determined during parsing *)
type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks

(** Printer for quirks modes (alias of [Dom.pp_quirks_mode]). *)
let pp_quirks_mode = Dom.pp_quirks_mode

(** Character encoding detected or specified *)
type encoding = Encoding.encoding =
  | Utf8
  | Utf16le
  | Utf16be
  | Windows_1252
  | Iso_8859_2
  | Euc_jp

(** Printer for encodings (alias of [Encoding.pp]). *)
let pp_encoding = Encoding.pp

(** Parse error record *)
type parse_error = Parser.parse_error

(** Fragment parsing context *)
type fragment_context = Parser.fragment_context

(** Create a fragment parsing context.
    @param tag_name Tag name of the context element
    @param namespace Namespace (None for HTML, Some "svg", Some "mathml")
*)
let make_fragment_context = Parser.make_fragment_context

(** Get the tag name from a fragment context *)
let fragment_context_tag = Parser.fragment_context_tag

(** Get the namespace from a fragment context *)
let fragment_context_namespace = Parser.fragment_context_namespace

(** Printer for fragment contexts (alias of [Parser.pp_fragment_context]). *)
let pp_fragment_context = Parser.pp_fragment_context

(** Get the error code *)
let error_code = Parser.error_code

(** Get the line number of an error (1-indexed) *)
let error_line = Parser.error_line

(** Get the column number of an error (1-indexed) *)
let error_column = Parser.error_column

(** Printer for parse errors (alias of [Parser.pp_parse_error]). *)
let pp_parse_error = Parser.pp_parse_error

(** Result of parsing an HTML document *)
type t = {
  root : node;
  errors : parse_error list;
  encoding : encoding option;
}

(** Debug printer for a parse result.

    Prints the root node, the {e number} of collected errors (not the errors
    themselves) and the encoding. The encoding goes through
    [Format.pp_print_option] with its default [none] printer, so an absent
    encoding prints as nothing after ["encoding="]. *)
let pp fmt t =
  Format.fprintf fmt "{root=%a; errors=%d; encoding=%a}"
    pp_node t.root
    (List.length t.errors)
    (Format.pp_print_option pp_encoding) t.encoding

(* Internal: convert Parser.t to our t *)
let of_parser_result (p : Parser.t) : t =
  { root = Parser.root p; errors = Parser.errors p; encoding = Parser.encoding p }

(** {1 Parsing Functions} *)

(** Parse HTML from a [Bytes.Reader.t].

    This is the primary parsing function. Create a reader from any source:
    - [Bytes.Reader.of_string s] for strings
    - [Bytes.Reader.of_in_channel ic] for files
    - [Bytes.Reader.of_bytes b] for byte buffers

    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<html><body>Hello</body></html>" in
      let result = Html5rw.parse reader
    ]}

    @param collect_errors If true, collect parse errors (default: false)
    @param fragment_context Context element for fragment parsing
*)
let parse ?collect_errors ?fragment_context reader =
  of_parser_result (Parser.parse ?collect_errors ?fragment_context reader)

(** Parse raw bytes with automatic encoding detection.

    This function implements the WHATWG encoding sniffing algorithm:
    1. Check for BOM (Byte Order Mark)
    2. Prescan for <meta charset>
    3. Fall back to UTF-8

    @param collect_errors If true, collect parse errors (default: false)
    @param transport_encoding Encoding from HTTP Content-Type header
    @param fragment_context Context element for fragment parsing
*)
let parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes =
  of_parser_result
    (Parser.parse_bytes ?collect_errors ?transport_encoding ?fragment_context
       bytes)

(** {1 Querying} *)

(** Query the DOM tree with a CSS selector.

    Supported selectors:
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]
    - Attribute: [[attr]], [[attr="value"]], [[attr~="value"]], [[attr|="value"]]
    - Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
    - Combinators: descendant (space), child (>), adjacent sibling (+), general sibling (~)

    {[
      let divs = Html5rw.query result "div.content > p"
    ]}

    @raise Selector.Selector_error if the selector is invalid
*)
let query t selector = Selector.query t.root selector

(** Check if a node matches a CSS selector. *)
let matches node selector = Selector.matches node selector

(** {1 Serialization} *)

(** Write the DOM tree to a [Bytes.Writer.t].

    {[
      open Bytesrw
      let buf = Buffer.create 1024 in
      let writer = Bytes.Writer.of_buffer buf in
      Html5rw.to_writer result writer;
      Bytes.Writer.write_eod writer;
      let html = Buffer.contents buf
    ]}

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
let to_writer ?pretty ?indent_size t writer =
  Dom.to_writer ?pretty ?indent_size writer t.root

(** Serialize the DOM tree to a string.

    Convenience function when the output fits in memory.

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
let to_string ?pretty ?indent_size t = Dom.to_html ?pretty ?indent_size t.root

(** Extract text content from the DOM tree.

    @param separator String to insert between text nodes (default: " ")
    @param strip If true, trim whitespace (default: true)
*)
let to_text ?separator ?strip t = Dom.to_text ?separator ?strip t.root

(** Serialize to html5lib test format (for testing). *)
let to_test_format t = Dom.to_test_format t.root

(** {1 Result Accessors} *)

(** Get the root node of the parsed document. *)
let root t = t.root

(** Get parse errors (if error collection was enabled). *)
let errors t = t.errors

(** Get the detected encoding (if parsed from bytes). *)
let encoding t = t.encoding

(** {1 DOM Utilities}

    Common DOM operations are available directly. For the full API,
    see the {!Dom} module.
*)

(** Create an element node.
    @param namespace None for HTML, Some "svg" or Some "mathml" for foreign content
    @param attrs List of (name, value) attribute pairs
*)
let create_element = Dom.create_element

(** Create a text node. *)
let create_text = Dom.create_text

(** Create a comment node. *)
let create_comment = Dom.create_comment

(** Create an empty document node. *)
let create_document = Dom.create_document

(** Create a document fragment node. *)
let create_document_fragment = Dom.create_document_fragment

(** Create a doctype node. *)
let create_doctype = Dom.create_doctype

(** Append a child node to a parent. *)
let append_child = Dom.append_child

(** Insert a node before a reference node. *)
let insert_before = Dom.insert_before

(** Remove a child node from its parent. *)
let remove_child = Dom.remove_child

(** Get an attribute value. *)
let get_attr = Dom.get_attr

(** Set an attribute value. *)
let set_attr = Dom.set_attr

(** Check if a node has an attribute. *)
let has_attr = Dom.has_attr

(** Get all descendant nodes. *)
let descendants = Dom.descendants

(** Get all ancestor nodes (from parent to root). *)
let ancestors = Dom.ancestors

(** Get text content of a node and its descendants. *)
let get_text_content = Dom.get_text_content

(** Clone a node.
    @param deep If true, also clone descendants (default: false)
*)
let clone = Dom.clone

(** {1 Node Predicates} *)

(** Test if a node is an element. *)
let is_element = Dom.is_element

(** Test if a node is a text node. *)
let is_text = Dom.is_text

(** Test if a node is a comment node. *)
let is_comment = Dom.is_comment

(** Test if a node is a document node. *)
let is_document = Dom.is_document

(** Test if a node is a document fragment. *)
let is_document_fragment = Dom.is_document_fragment

(** Test if a node is a doctype node. *)
let is_doctype = Dom.is_doctype

(** Test if a node has children. *)
let has_children = Dom.has_children