OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 1.9 kB view raw
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** XHTML parser using xmlm for proper XML parsing.

    This module provides XML parsing for XHTML files. While the HTML5 parser
    handles most content, XHTML requires proper XML parsing to correctly handle:

    - Self-closing tags on non-void elements (e.g., [<div/>])
    - XML namespaces for SVG and MathML
    - Strict XML well-formedness requirements

    {2 Usage}

    [parse_xhtml] yields the document root node directly (not an
    [xhtml_doc] record):

    {[
      if Xhtml_parser.is_xhtml_file (Some "page.xhtml") then
        match Xhtml_parser.parse_xhtml content with
        | Ok root -> (* Process the XHTML document root node *)
        | Error msg -> (* Handle parse error *)
    ]}
*)

(** {1 Types} *)

type xhtml_doc = {
  root : Html5rw.Dom.node;
      (** The document root node. *)
  errors : Html5rw.Error.t list;
      (** Parse errors attached to the document (empty for valid XML). *)
}
(** An XHTML document representation: a root node paired with a list of
    parse errors.

    The record is exposed concretely, so callers can build one with a record
    literal and read its fields directly. Note that [parse_xhtml] returns the
    bare root node rather than an [xhtml_doc]. *)

(** {1 Parsing} *)

val parse_xhtml : string -> (Html5rw.Dom.node, string) result
(** [parse_xhtml content] parses XHTML content using xmlm.

    The successful result carries the root node itself, not an [xhtml_doc].

    @param content The XHTML content as a string
    @return [Ok root] with the document root on success,
            [Error message] with parse error details on failure *)

val is_xhtml_file : string option -> bool
(** [is_xhtml_file system_id] checks if [system_id] indicates an XHTML file.

    NOTE(review): the result for [None] is not specified by this interface;
    presumably [false] -- confirm against the implementation.

    @param system_id The optional file path or identifier
    @return [true] if the path ends with [".xhtml"] *)

(** {1 Document Access} *)

val xhtml_root : xhtml_doc -> Html5rw.Dom.node
(** [xhtml_root doc] returns the document root node of [doc]. *)

val xhtml_errors : xhtml_doc -> Html5rw.Error.t list
(** [xhtml_errors doc] returns the parse errors of [doc].

    NOTE(review): this accessor was previously documented as "always empty
    for XHTML", whereas the [errors] field is documented as "empty for valid
    XML". The two statements disagree; confirm against the implementation
    which one holds before relying on the list being empty. *)