lib/check/xhtml_parser.mli at main · anil.recoil.org/ocaml-html5rw

OCaml HTML5 parser/serialiser based on Python's JustHTML
ocaml-html5rw / lib / check / xhtml_parser.mli
at main 1.9 kB view raw
 1(*---------------------------------------------------------------------------
 2  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
 3  SPDX-License-Identifier: MIT
 4 ---------------------------------------------------------------------------*)
 5
 6(** XHTML parser using xmlm for proper XML parsing.
 7
 8    This module provides XML parsing for XHTML files. While the HTML5 parser
 9    handles most content, XHTML requires proper XML parsing to correctly handle:
10
11    - Self-closing tags on non-void elements (e.g., [<div/>])
12    - XML namespaces for SVG and MathML
13    - Strict XML well-formedness requirements
14
15    {2 Usage}
16
17    {[
18      if Xhtml_parser.is_xhtml_file (Some "page.xhtml") then
19        match Xhtml_parser.parse_xhtml content with
20        | Ok root -> (* Process the XHTML document root node *)
21        | Error msg -> (* Handle parse error *)
22    ]}
23*)
24
25(** {1 Types} *)
26
27type xhtml_doc = {
28  root : Html5rw.Dom.node;
29  (** The document root node. *)
30  errors : Html5rw.Error.t list;
31  (** Parse errors (empty for valid XML). *)
32}
33(** An XHTML document: a root node plus its parse errors; see {!xhtml_root} and {!xhtml_errors}. *)
34
35(** {1 Parsing} *)
36
37val parse_xhtml : string -> (Html5rw.Dom.node, string) result
38(** [parse_xhtml content] parses the XHTML text [content] as XML using xmlm.
39    The result carries the root node directly, not an {!xhtml_doc}.
40    @param content The XHTML content as a string
41    @return [Ok root] with the document root node on success, or
42            [Error message] with parse error details on failure *)
43
44val is_xhtml_file : string option -> bool
45(** [is_xhtml_file system_id] tests whether [system_id] names an XHTML file.
46    NOTE(review): the result for [None] is not stated here; presumably [false], confirm.
47    @param system_id The optional file path or identifier
48    @return [true] if the path ends with ".xhtml" *)
49
50(** {1 Document Access} *)
51
52val xhtml_root : xhtml_doc -> Html5rw.Dom.node
53(** [xhtml_root doc] returns the document root node of [doc] (presumably its [root] field). *)
54
55val xhtml_errors : xhtml_doc -> Html5rw.Error.t list
56(** [xhtml_errors doc] returns the parse errors of [doc] (always empty for XHTML; NOTE(review): the [errors] field says only for valid XML, confirm). *)