OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6open Brr
7
8(* Helper to compare elements using JavaScript strict equality *)
9let el_equal a b =
10 Jv.strict_equal (El.to_jv a) (El.to_jv b)
11
12(* A location-keyed map for finding elements by line/column *)
13module LocMap = Map.Make(struct
14 type t = int * int
15 let compare = compare
16end)
17
18type t = {
19 root : El.t;
20 html_source : string;
21 loc_to_el : El.t LocMap.t;
22 (* Mapping from (line, column) to browser elements *)
23}
24
25let outer_html el =
26 Jstr.to_string (Jv.get (El.to_jv el) "outerHTML" |> Jv.to_jstr)
27
28let inner_html el =
29 Jstr.to_string (Jv.get (El.to_jv el) "innerHTML" |> Jv.to_jstr)
30
31let iter_elements f root =
32 let rec walk el =
33 f el;
34 List.iter walk (El.children ~only_els:true el)
35 in
36 walk root
37
38let fold_elements f acc root =
39 let rec walk acc el =
40 let acc = f acc el in
41 List.fold_left walk acc (El.children ~only_els:true el)
42 in
43 walk acc root
44
45let filter_elements pred root =
46 fold_elements (fun acc el ->
47 if pred el then el :: acc else acc
48 ) [] root |> List.rev
49
50(* Build element map by walking browser DOM and parsed DOM in parallel *)
51let create root =
52 let raw_html = outer_html root in
53 (* Prepend DOCTYPE if not present - outerHTML doesn't include it *)
54 let html =
55 let lower = String.lowercase_ascii raw_html in
56 if String.length lower >= 9 && String.sub lower 0 9 = "<!doctype" then
57 raw_html
58 else
59 "<!DOCTYPE html>" ^ raw_html
60 in
61
62 (* Parse the HTML to get a tree with locations *)
63 let reader = Bytesrw.Bytes.Reader.of_string html in
64 let parsed = Html5rw.parse ~collect_errors:false reader in
65
66 (* Walk both trees in parallel to build the mapping.
67 Browser elements are in document order, and so are Html5rw nodes. *)
68 let browser_elements = fold_elements (fun acc el -> el :: acc) [] root |> List.rev in
69
70 (* Extract elements from Html5rw DOM in document order *)
71 let rec extract_html5rw_elements acc node =
72 if Html5rw.is_element node then
73 let children = node.Html5rw.Dom.children in
74 let acc = node :: acc in
75 List.fold_left extract_html5rw_elements acc children
76 else
77 let children = node.Html5rw.Dom.children in
78 List.fold_left extract_html5rw_elements acc children
79 in
80 let html5rw_elements = extract_html5rw_elements [] (Html5rw.root parsed) |> List.rev in
81
82 (* Build the location map by matching elements *)
83 let loc_to_el =
84 (* Find the starting point in parsed elements that matches the root tag *)
85 let root_tag = String.lowercase_ascii (Jstr.to_string (El.tag_name root)) in
86 let rec find_start = function
87 | [] -> []
88 | h_el :: rest ->
89 if String.lowercase_ascii h_el.Html5rw.Dom.name = root_tag then
90 h_el :: rest
91 else
92 find_start rest
93 in
94 let html5rw_elements_aligned = find_start html5rw_elements in
95
96 let rec match_elements loc_map browser_els html5rw_els =
97 match browser_els, html5rw_els with
98 | [], _ | _, [] -> loc_map
99 | b_el :: b_rest, h_el :: h_rest ->
100 let b_tag = String.lowercase_ascii (Jstr.to_string (El.tag_name b_el)) in
101 let h_tag = String.lowercase_ascii h_el.Html5rw.Dom.name in
102 if b_tag = h_tag then
103 (* Tags match - record the mapping if we have a location *)
104 let loc_map =
105 match h_el.Html5rw.Dom.location with
106 | Some loc -> LocMap.add (loc.line, loc.column) b_el loc_map
107 | None -> loc_map
108 in
109 match_elements loc_map b_rest h_rest
110 else
111 (* Tags don't match - try skipping the parsed element first *)
112 (* This handles cases where parser creates implicit elements *)
113 match_elements loc_map browser_els h_rest
114 in
115 match_elements LocMap.empty browser_elements html5rw_elements_aligned
116 in
117
118 { root; html_source = html; loc_to_el }, html
119
120let find_by_location t ~line ~column =
121 LocMap.find_opt (line, column) t.loc_to_el
122
123let find_by_location_and_tag t ~line ~column ~tag =
124 match LocMap.find_opt (line, column) t.loc_to_el with
125 | Some el when String.lowercase_ascii (Jstr.to_string (El.tag_name el)) =
126 String.lowercase_ascii tag ->
127 Some el
128 | _ -> None
129
130let find_for_message t msg =
131 (* Try to find element by location first *)
132 match msg.Htmlrw_check.location with
133 | Some loc ->
134 (match msg.Htmlrw_check.element with
135 | Some tag -> find_by_location_and_tag t ~line:loc.line ~column:loc.column ~tag
136 | None -> find_by_location t ~line:loc.line ~column:loc.column)
137 | None ->
138 (* No location - try to find by element name if we have one *)
139 match msg.Htmlrw_check.element with
140 | Some tag ->
141 (* Find first element with this tag *)
142 let matches = filter_elements (fun el ->
143 String.lowercase_ascii (Jstr.to_string (El.tag_name el)) =
144 String.lowercase_ascii tag
145 ) t.root in
146 (match matches with
147 | el :: _ -> Some el
148 | [] -> None)
149 | None -> None
150
151let html_source t = t.html_source
152
153let root_element t = t.root
154
155let selector_path ?root el =
156 let stop_at = match root with
157 | Some r -> Some r
158 | None -> None
159 in
160 let rec build_path el acc =
161 (* Stop if we've reached the root *)
162 let should_stop = match stop_at with
163 | Some r -> el_equal el r
164 | None -> String.lowercase_ascii (Jstr.to_string (El.tag_name el)) = "body"
165 in
166 if should_stop then
167 acc
168 else
169 let tag = String.lowercase_ascii (Jstr.to_string (El.tag_name el)) in
170 let segment =
171 match El.parent el with
172 | None -> tag
173 | Some parent ->
174 let siblings = El.children ~only_els:true parent in
175 let same_tag = List.filter (fun sib ->
176 String.lowercase_ascii (Jstr.to_string (El.tag_name sib)) = tag
177 ) siblings in
178 if List.length same_tag <= 1 then
179 tag
180 else
181 let idx =
182 let rec find_idx i = function
183 | [] -> 1
184 | sib :: rest ->
185 if el_equal sib el then i
186 else find_idx (i + 1) rest
187 in
188 find_idx 1 same_tag
189 in
190 Printf.sprintf "%s:nth-of-type(%d)" tag idx
191 in
192 let new_acc = segment :: acc in
193 match El.parent el with
194 | None -> new_acc
195 | Some parent -> build_path parent new_acc
196 in
197 String.concat " > " (build_path el [])
198
199let short_selector ?root el =
200 (* Try ID first *)
201 match El.at (Jstr.v "id") el with
202 | Some id when not (Jstr.is_empty id) ->
203 "#" ^ Jstr.to_string id
204 | _ ->
205 (* Try parent ID + short path *)
206 let rec find_id_ancestor el depth =
207 if depth > 3 then None
208 else match El.parent el with
209 | None -> None
210 | Some parent ->
211 match El.at (Jstr.v "id") parent with
212 | Some id when not (Jstr.is_empty id) -> Some (parent, id)
213 | _ -> find_id_ancestor parent (depth + 1)
214 in
215 match find_id_ancestor el 0 with
216 | Some (ancestor, id) ->
217 let path = selector_path ~root:ancestor el in
218 "#" ^ Jstr.to_string id ^ " > " ^ path
219 | None ->
220 selector_path ?root el