OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** XHTML parser using xmlm for proper XML parsing.
2
3 This module provides XML parsing for XHTML files, which the HTML5 parser
4 cannot handle correctly (especially self-closing tags on non-void elements). *)
5
6(** Parse XHTML content using xmlm and return a DOM tree. *)
7let parse_xhtml content =
8 let input = Xmlm.make_input (`String (0, content)) in
9
10 (* Stack of nodes during parsing *)
11 let stack = ref [] in
12 let root = Html5rw.Dom.create_document () in
13 stack := [root];
14
15 (* Helper to get namespace shorthand *)
16 let ns_shorthand ns =
17 if ns = "http://www.w3.org/2000/svg" then Some "svg"
18 else if ns = "http://www.w3.org/1998/Math/MathML" then Some "mathml"
19 else if ns = "http://www.w3.org/1999/xhtml" then None (* HTML namespace *)
20 else if ns = "" then None (* No namespace = HTML *)
21 else Some ns (* Keep other namespaces as-is *)
22 in
23
24 (* Process xmlm signals *)
25 let rec process () =
26 if Xmlm.eoi input then ()
27 else begin
28 match Xmlm.input input with
29 | `Dtd _ ->
30 (* Skip DTD for now *)
31 process ()
32 | `El_start ((ns, local), attrs) ->
33 (* Create element node *)
34 let namespace = ns_shorthand ns in
35 let attr_list = List.map (fun ((_, aname), aval) -> (aname, aval)) attrs in
36 let node = Html5rw.Dom.create_element local ~namespace ~attrs:attr_list () in
37 (* Append to current parent *)
38 (match !stack with
39 | parent :: _ -> Html5rw.Dom.append_child parent node
40 | [] -> ());
41 (* Push onto stack *)
42 stack := node :: !stack;
43 process ()
44 | `El_end ->
45 (* Pop from stack *)
46 (match !stack with
47 | _ :: rest -> stack := rest
48 | [] -> ());
49 process ()
50 | `Data text ->
51 (* Create text node and append to current parent *)
52 let trimmed = String.trim text in
53 if trimmed <> "" || String.length text > 0 then begin
54 let text_node = Html5rw.Dom.create_text text in
55 (match !stack with
56 | parent :: _ -> Html5rw.Dom.append_child parent text_node
57 | [] -> ())
58 end;
59 process ()
60 end
61 in
62
63 try
64 process ();
65 Ok root
66 with
67 | Xmlm.Error ((line, col), err) ->
68 Error (Printf.sprintf "XML parse error at %d:%d: %s" line col (Xmlm.error_message err))
69
70(** Check if a system_id indicates an XHTML file. *)
71let is_xhtml_file system_id =
72 match system_id with
73 | Some path ->
74 String.length path > 6 &&
75 String.sub path (String.length path - 6) 6 = ".xhtml"
76 | None -> false
77
78(** Wrap DOM in an Html5rw.t-compatible structure for the checker. *)
79type xhtml_doc = {
80 root : Html5rw.Dom.node;
81 errors : Html5rw.Error.t list;
82}
83
84let xhtml_root doc = doc.root
85let xhtml_errors _doc = [] (* XML parser handles errors differently *)