OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
(** XHTML parser using xmlm for proper XML parsing.

    This module provides XML parsing for XHTML files. While the HTML5 parser
    handles most content, XHTML requires proper XML parsing to correctly
    handle:

    - Self-closing tags on non-void elements (e.g., [<div/>])
    - XML namespaces for SVG and MathML
    - Strict XML well-formedness requirements

    {2 Usage}

    {[
      if Xhtml_parser.is_xhtml_file (Some "page.xhtml") then
        match Xhtml_parser.parse_xhtml content with
        | Ok root -> (* Process the XHTML document root node *)
        | Error msg -> (* Handle parse error *)
    ]}
*)
24
25(** {1 Types} *)
26
type xhtml_doc = {
  root : Html5rw.Dom.node;  (** Root node of the document. *)
  errors : Html5rw.Error.t list;
      (** Parse errors collected for the document; the list is empty for
          valid XML. *)
}
(** Representation of an XHTML document: its root node paired with the list
    of parse errors. *)
34
35(** {1 Parsing} *)
36
val parse_xhtml : string -> (Html5rw.Dom.node, string) result
(** [parse_xhtml content] parses the XHTML text [content] using xmlm.

    The successful result carries the document root node
    ([Html5rw.Dom.node]) directly, not an [xhtml_doc] record.

    @param content the XHTML source, given as a string
    @return
      [Ok root], where [root] is the document root, when parsing succeeds;
      [Error message], where [message] holds the parse error details, when
      it fails *)
43
val is_xhtml_file : string option -> bool
(** [is_xhtml_file system_id] tells whether [system_id] designates an XHTML
    file.

    @param system_id an optional file path or identifier
    @return [true] when the path ends with [".xhtml"] *)
49
50(** {1 Document Access} *)
51
val xhtml_root : xhtml_doc -> Html5rw.Dom.node
(** [xhtml_root doc] is the root node of the document [doc]. *)
54
val xhtml_errors : xhtml_doc -> Html5rw.Error.t list
(** [xhtml_errors doc] is the list of parse errors of the document [doc].
    This list is always empty for XHTML. *)