OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(* HTML5 DOM node types *)
7
8type doctype_data = {
9 name : string option;
10 public_id : string option;
11 system_id : string option;
12}
13
14(** Source location for nodes *)
15type location = {
16 line : int;
17 column : int;
18 end_line : int option;
19 end_column : int option;
20}
21
22type quirks_mode = No_quirks | Quirks | Limited_quirks
23
24type node = {
25 mutable name : string;
26 mutable namespace : string option; (* None = html, Some "svg", Some "mathml" *)
27 mutable attrs : (string * string) list;
28 mutable children : node list;
29 mutable parent : node option;
30 mutable data : string; (* For text, comment nodes *)
31 mutable template_content : node option; (* For <template> elements *)
32 mutable doctype : doctype_data option; (* For doctype nodes *)
33 mutable location : location option; (* Source location where node was parsed *)
34}
35
36(* Node name constants *)
37let document_name = "#document"
38let document_fragment_name = "#document-fragment"
39let text_name = "#text"
40let comment_name = "#comment"
41let doctype_name = "!doctype"
42
43(* Base node constructor - all nodes share this structure *)
44let make_node ~name ?(namespace=None) ?(attrs=[]) ?(data="") ?template_content ?doctype ?location () = {
45 name;
46 namespace;
47 attrs;
48 children = [];
49 parent = None;
50 data;
51 template_content;
52 doctype;
53 location;
54}
55
56(* Constructors *)
57let create_element name ?(namespace=None) ?(attrs=[]) ?location () =
58 make_node ~name ~namespace ~attrs ?location ()
59
60let create_text ?location data =
61 make_node ~name:text_name ~data ?location ()
62
63let create_comment ?location data =
64 make_node ~name:comment_name ~data ?location ()
65
66let create_document () =
67 make_node ~name:document_name ()
68
69let create_document_fragment () =
70 make_node ~name:document_fragment_name ()
71
72let create_doctype ?name ?public_id ?system_id ?location () =
73 make_node ~name:doctype_name ~doctype:{ name; public_id; system_id } ?location ()
74
75let create_template ?(namespace=None) ?(attrs=[]) ?location () =
76 let node = create_element "template" ~namespace ~attrs ?location () in
77 node.template_content <- Some (create_document_fragment ());
78 node
79
80(* Predicates *)
81let is_element node =
82 not (List.mem node.name [text_name; comment_name; document_name; document_fragment_name; doctype_name])
83
84let is_text node = node.name = text_name
85let is_comment node = node.name = comment_name
86let is_document node = node.name = document_name
87let is_document_fragment node = node.name = document_fragment_name
88let is_doctype node = node.name = doctype_name
89let has_children node = node.children <> []
90
91(* DOM manipulation *)
92let append_child parent child =
93 child.parent <- Some parent;
94 parent.children <- parent.children @ [child]
95
96let insert_before parent new_child ref_child =
97 new_child.parent <- Some parent;
98 let rec insert acc = function
99 | [] -> List.rev acc @ [new_child]
100 | x :: xs when x == ref_child -> List.rev acc @ [new_child; x] @ xs
101 | x :: xs -> insert (x :: acc) xs
102 in
103 parent.children <- insert [] parent.children
104
105let remove_child parent child =
106 child.parent <- None;
107 parent.children <- List.filter (fun c -> c != child) parent.children
108
109(* Find the last text node before a reference point *)
110let last_child_text parent =
111 match List.rev parent.children with
112 | last :: _ when is_text last -> Some last
113 | _ -> None
114
115let insert_text_at parent text before_node =
116 match before_node with
117 | None ->
118 (* Append - merge with last child if it's text *)
119 (match last_child_text parent with
120 | Some txt -> txt.data <- txt.data ^ text
121 | None -> append_child parent (create_text text))
122 | Some ref ->
123 (* Find last text node before ref_child *)
124 let rec find_prev_text = function
125 | [] | [_] -> None
126 | prev :: curr :: _ when curr == ref && is_text prev -> Some prev
127 | _ :: rest -> find_prev_text rest
128 in
129 match find_prev_text parent.children with
130 | Some txt -> txt.data <- txt.data ^ text
131 | None -> insert_before parent (create_text text) ref
132
133(* Location helpers *)
134let make_location ~line ~column ?end_line ?end_column () =
135 { line; column; end_line; end_column }
136
137let set_location node ~line ~column ?end_line ?end_column () =
138 node.location <- Some { line; column; end_line; end_column }
139
140let get_location node = node.location
141
142(* Attribute helpers *)
143let get_attr node name = List.assoc_opt name node.attrs
144
145let set_attr node name value =
146 node.attrs <- List.filter (fun (n, _) -> n <> name) node.attrs @ [(name, value)]
147
148let has_attr node name = List.mem_assoc name node.attrs
149
150(* Whitespace splitting for space-separated attribute values per HTML5 spec.
151 Handles ASCII whitespace: space, tab, newline, carriage return, form feed *)
152let split_on_whitespace s =
153 let is_whitespace = function
154 | ' ' | '\t' | '\n' | '\r' | '\x0c' -> true
155 | _ -> false
156 in
157 let len = String.length s in
158 let rec find_start acc i =
159 if i >= len then List.rev acc
160 else if is_whitespace s.[i] then find_start acc (i + 1)
161 else find_end acc i (i + 1)
162 and find_end acc start i =
163 if i >= len then List.rev (String.sub s start (i - start) :: acc)
164 else if is_whitespace s.[i] then find_start (String.sub s start (i - start) :: acc) (i + 1)
165 else find_end acc start (i + 1)
166 in
167 find_start [] 0
168
169(* Get space-separated attribute as list *)
170let get_attr_list node name =
171 match get_attr node name with
172 | Some s -> split_on_whitespace s
173 | None -> []
174
175(* Common space-separated attribute accessors *)
176let get_class_list node = get_attr_list node "class"
177let get_rel_list node = List.map String.lowercase_ascii (get_attr_list node "rel")
178let get_headers_list node = get_attr_list node "headers"
179let get_itemref_list node = get_attr_list node "itemref"
180let get_itemprop_list node = get_attr_list node "itemprop"
181let get_itemtype_list node = get_attr_list node "itemtype"
182
183(* Tree traversal *)
184let rec descendants node =
185 List.concat_map (fun n -> n :: descendants n) node.children
186
187let ancestors node =
188 let rec collect acc n =
189 match n.parent with
190 | None -> List.rev acc
191 | Some p -> collect (p :: acc) p
192 in
193 collect [] node
194
195let rec get_text_content node =
196 if is_text node then node.data
197 else String.concat "" (List.map get_text_content node.children)
198
199(* Clone *)
200let rec clone ?(deep=false) node =
201 let new_node = make_node
202 ~name:node.name
203 ~namespace:node.namespace
204 ~attrs:node.attrs
205 ~data:node.data
206 ?doctype:node.doctype
207 ?location:node.location
208 ()
209 in
210 if deep then begin
211 new_node.children <- List.map (clone ~deep:true) node.children;
212 List.iter (fun c -> c.parent <- Some new_node) new_node.children;
213 Option.iter (fun tc ->
214 new_node.template_content <- Some (clone ~deep:true tc)
215 ) node.template_content
216 end;
217 new_node
218
219(* Pretty printers *)
220let pp_doctype_data fmt (d : doctype_data) =
221 Format.fprintf fmt "<!DOCTYPE %s%s%s>"
222 (Option.value ~default:"" d.name)
223 (match d.public_id with Some p -> " PUBLIC \"" ^ p ^ "\"" | None -> "")
224 (match d.system_id with Some s -> " \"" ^ s ^ "\"" | None -> "")
225
226let pp_quirks_mode fmt = function
227 | No_quirks -> Format.pp_print_string fmt "no-quirks"
228 | Quirks -> Format.pp_print_string fmt "quirks"
229 | Limited_quirks -> Format.pp_print_string fmt "limited-quirks"
230
231let pp fmt node =
232 if is_text node then
233 Format.fprintf fmt "#text %S" node.data
234 else if is_comment node then
235 Format.fprintf fmt "<!-- %s -->" node.data
236 else if is_document node then
237 Format.pp_print_string fmt "#document"
238 else if is_document_fragment node then
239 Format.pp_print_string fmt "#document-fragment"
240 else if is_doctype node then
241 (match node.doctype with
242 | Some d -> pp_doctype_data fmt d
243 | None -> Format.pp_print_string fmt "<!DOCTYPE>")
244 else begin
245 Format.fprintf fmt "<%s" node.name;
246 List.iter (fun (k, v) -> Format.fprintf fmt " %s=%S" k v) node.attrs;
247 Format.pp_print_char fmt '>'
248 end