(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Html5rw - Pure OCaml HTML5 Parser

    This module provides a complete HTML5 parsing solution following the
    WHATWG specification. It uses bytesrw for streaming input/output.

    {2 Quick Start}

    Parse HTML from a reader:
    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
      let result = Html5rw.parse reader in
      let html = Html5rw.to_string result
    ]}

    Parse from a file (the channel is closed even if parsing raises):
    {[
      open Bytesrw
      let result =
        let ic = open_in_bin "page.html" in
        Fun.protect
          ~finally:(fun () -> close_in_noerr ic)
          (fun () -> Html5rw.parse (Bytes.Reader.of_in_channel ic))
    ]}

    Query with CSS selectors:
    {[
      let result = Html5rw.parse reader in
      let divs = Html5rw.query result "div.content"
    ]}
*)

(** {1 Error Handling} *)

(** Global error type that wraps all errors raised by the Html5rw library.

    This provides a unified error type for all parsing and selector errors,
    along with printers for display and debugging.
*)
module Error = struct
  (** The unified error type for the Html5rw library. *)
  type t =
    | Parse_error of {
        code : Parse_error_code.t;
        line : int;
        column : int;
      }
      (** An HTML parse error, including location information. *)
    | Selector_error of Selector.Error_code.t
      (** A CSS selector parse error. *)

  (** [of_parse_error err] wraps a parser error, copying its code, line and
      column as reported by [Parser.error_code], [Parser.error_line] and
      [Parser.error_column]. *)
  let of_parse_error (err : Parser.parse_error) : t =
    Parse_error {
      code = Parser.error_code err;
      line = Parser.error_line err;
      column = Parser.error_column err;
    }

  (** [of_selector_error code] wraps a selector error code. *)
  let of_selector_error (code : Selector.Error_code.t) : t =
    Selector_error code

  (** Render an error for display: ["Parse error at LINE:COLUMN: CODE"] for
      parse errors, ["Selector error: MESSAGE"] for selector errors. *)
  let to_string = function
    | Parse_error { code; line; column } ->
        Printf.sprintf "Parse error at %d:%d: %s" line column
          (Parse_error_code.to_string code)
    | Selector_error code ->
        Printf.sprintf "Selector error: %s"
          (Selector.Error_code.to_human_string code)

  (** Formatter-based printer; emits exactly the text of {!to_string}. *)
  let pp fmt err = Format.pp_print_string fmt (to_string err)

  (** Get the error code as a kebab-case string. *)
  let code_string = function
    | Parse_error { code; _ } -> Parse_error_code.to_string code
    | Selector_error code -> Selector.Error_code.to_string code
end

(** {1 Sub-modules} *)

(** Parse error code types *)
module Parse_error_code = Parse_error_code

(** DOM types and manipulation functions *)
module Dom = Dom

(** HTML5 tokenizer *)
module Tokenizer = Tokenizer

(** Encoding detection and decoding *)
module Encoding = Encoding

(** CSS selector engine *)
module Selector = Selector

(** HTML entity decoding *)
module Entities = Entities

(** Low-level parser access *)
module Parser = Parser

(** {1 Core Types} *)

(** DOM node type. See {!Dom} for manipulation functions. *)
type node = Dom.node

(** Pretty-printer for {!node} (alias of [Dom.pp]). *)
let pp_node = Dom.pp

(** Doctype information *)
type doctype_data = Dom.doctype_data = {
  name : string option;
  public_id : string option;
  system_id : string option;
}

(** Pretty-printer for {!doctype_data} (alias of [Dom.pp_doctype_data]). *)
let pp_doctype_data = Dom.pp_doctype_data

(** Source location for nodes *)
type location = Dom.location = {
  line : int;
  column : int;
  end_line : int option;
  end_column : int option;
}

(** Alias of [Dom.make_location]; see {!Dom} for its arguments. *)
let make_location = Dom.make_location

(** Alias of [Dom.get_location]; see {!Dom} for its arguments. *)
let get_location = Dom.get_location

(** Alias of [Dom.set_location]; see {!Dom} for its arguments. *)
let set_location = Dom.set_location

(** Quirks mode as determined during parsing *)
type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks

(** Pretty-printer for {!quirks_mode} (alias of [Dom.pp_quirks_mode]). *)
let pp_quirks_mode = Dom.pp_quirks_mode

(** Character encoding detected or specified *)
type encoding = Encoding.encoding =
  | Utf8
  | Utf16le
  | Utf16be
  | Windows_1252
  | Iso_8859_2
  | Euc_jp

(** Pretty-printer for {!encoding} (alias of [Encoding.pp]). *)
let pp_encoding = Encoding.pp

(** Parse error record *)
type parse_error = Parser.parse_error

(** Fragment parsing context *)
type fragment_context = Parser.fragment_context

(** Create a fragment parsing context.
    @param tag_name Tag name of the context element
    @param namespace Namespace (None for HTML, Some "svg", Some "mathml")
*)
let make_fragment_context = Parser.make_fragment_context

(** Get the tag name from a fragment context *)
let fragment_context_tag = Parser.fragment_context_tag

(** Get the namespace from a fragment context *)
let fragment_context_namespace = Parser.fragment_context_namespace

(** Pretty-printer for {!fragment_context}
    (alias of [Parser.pp_fragment_context]). *)
let pp_fragment_context = Parser.pp_fragment_context

(** Get the error code *)
let error_code = Parser.error_code

(** Get the line number of an error (1-indexed) *)
let error_line = Parser.error_line

(** Get the column number of an error (1-indexed) *)
let error_column = Parser.error_column

(** Pretty-printer for {!parse_error} (alias of [Parser.pp_parse_error]). *)
let pp_parse_error = Parser.pp_parse_error

(** Result of parsing an HTML document *)
type t = {
  root : node;
  errors : parse_error list;
  encoding : encoding option;
}

(** Print a one-line summary of a parse result: the root node, the {e number}
    of collected errors (not the errors themselves), and the encoding.
    Nothing is printed for the encoding when it is [None]. *)
let pp fmt t =
  Format.fprintf fmt "{root=%a; errors=%d; encoding=%a}"
    pp_node t.root
    (List.length t.errors)
    (Format.pp_print_option pp_encoding) t.encoding

(* Internal: convert Parser.t to our t *)
let of_parser_result (p : Parser.t) : t =
  { root = Parser.root p; errors = Parser.errors p; encoding = Parser.encoding p }

(** {1 Parsing Functions} *)

(** Parse HTML from a [Bytes.Reader.t].

    This is the primary parsing function. Create a reader from any source:
    - [Bytes.Reader.of_string s] for strings
    - [Bytes.Reader.of_in_channel ic] for files
    - [Bytes.Reader.of_bytes b] for byte buffers

    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<html><body>Hello</body></html>" in
      let result = Html5rw.parse reader
    ]}

    @param collect_errors If true, collect parse errors (default: false)
    @param fragment_context Context element for fragment parsing
*)
let parse ?collect_errors ?fragment_context reader =
  of_parser_result (Parser.parse ?collect_errors ?fragment_context reader)

(** Parse raw bytes with automatic encoding detection.

    This function implements the WHATWG encoding sniffing algorithm:
    1. Check for BOM (Byte Order Mark)
    2. Prescan for <meta charset>
    3. Fall back to UTF-8

    @param collect_errors If true, collect parse errors (default: false)
    @param transport_encoding Encoding from HTTP Content-Type header
    @param fragment_context Context element for fragment parsing
*)
let parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes =
  of_parser_result (Parser.parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes)

(** {1 Querying} *)

(** Query the DOM tree with a CSS selector.

    Supported selectors:
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]
    - Attribute: [[attr]], [[attr="value"]], [[attr~="value"]], [[attr|="value"]]
    - Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
    - Combinators: descendant (space), child (>), adjacent sibling (+), general sibling (~)

    {[
      let divs = Html5rw.query result "div.content > p"
    ]}

    @raise Selector.Selector_error if the selector is invalid
*)
let query t selector = Selector.query t.root selector

(** Check if a node matches a CSS selector. *)
let matches node selector = Selector.matches node selector

(** {1 Serialization} *)

(** Write the DOM tree to a [Bytes.Writer.t].

    {[
      open Bytesrw
      let buf = Buffer.create 1024 in
      let writer = Bytes.Writer.of_buffer buf in
      Html5rw.to_writer result writer;
      Bytes.Writer.write_eod writer;
      let html = Buffer.contents buf
    ]}

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
let to_writer ?pretty ?indent_size t writer =
  Dom.to_writer ?pretty ?indent_size writer t.root

(** Serialize the DOM tree to a string.

    Convenience function when the output fits in memory.

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
let to_string ?pretty ?indent_size t = Dom.to_html ?pretty ?indent_size t.root

(** Extract text content from the DOM tree.

    @param separator String to insert between text nodes (default: " ")
    @param strip If true, trim whitespace (default: true)
*)
let to_text ?separator ?strip t = Dom.to_text ?separator ?strip t.root

(** Serialize to html5lib test format (for testing). *)
let to_test_format t = Dom.to_test_format t.root

(** {1 Result Accessors} *)

(** Get the root node of the parsed document. *)
let root t = t.root

(** Get parse errors (if error collection was enabled). *)
let errors t = t.errors

(** Get the detected encoding (if parsed from bytes). *)
let encoding t = t.encoding

(** {1 DOM Utilities}

    Common DOM operations are available directly. For the full API,
    see the {!Dom} module.
*)

(** Create an element node.
    @param namespace None for HTML, Some "svg" or Some "mathml" for foreign content
    @param attrs List of (name, value) attribute pairs
*)
let create_element = Dom.create_element

(** Create a text node. *)
let create_text = Dom.create_text

(** Create a comment node. *)
let create_comment = Dom.create_comment

(** Create an empty document node. *)
let create_document = Dom.create_document

(** Create a document fragment node. *)
let create_document_fragment = Dom.create_document_fragment

(** Create a doctype node. *)
let create_doctype = Dom.create_doctype

(** Append a child node to a parent. *)
let append_child = Dom.append_child

(** Insert a node before a reference node. *)
let insert_before = Dom.insert_before

(** Remove a child node from its parent. *)
let remove_child = Dom.remove_child

(** Get an attribute value. *)
let get_attr = Dom.get_attr

(** Set an attribute value. *)
let set_attr = Dom.set_attr

(** Check if a node has an attribute. *)
let has_attr = Dom.has_attr

(** Get all descendant nodes. *)
let descendants = Dom.descendants

(** Get all ancestor nodes (from parent to root). *)
let ancestors = Dom.ancestors

(** Get text content of a node and its descendants. *)
let get_text_content = Dom.get_text_content

(** Clone a node.
    @param deep If true, also clone descendants (default: false)
*)
let clone = Dom.clone

(** {1 Node Predicates} *)

(** Test if a node is an element. *)
let is_element = Dom.is_element

(** Test if a node is a text node. *)
let is_text = Dom.is_text

(** Test if a node is a comment node. *)
let is_comment = Dom.is_comment

(** Test if a node is a document node. *)
let is_document = Dom.is_document

(** Test if a node is a document fragment. *)
let is_document_fragment = Dom.is_document_fragment

(** Test if a node is a doctype node. *)
let is_doctype = Dom.is_doctype

(** Test if a node has children. *)
let has_children = Dom.has_children