OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6open Brr 7 8(* Helper to compare elements using JavaScript strict equality *) 9let el_equal a b = 10 Jv.strict_equal (El.to_jv a) (El.to_jv b) 11 12(* A location-keyed map for finding elements by line/column *) 13module LocMap = Map.Make(struct 14 type t = int * int 15 let compare = compare 16end) 17 18type t = { 19 root : El.t; 20 html_source : string; 21 loc_to_el : El.t LocMap.t; 22 (* Mapping from (line, column) to browser elements *) 23} 24 25let outer_html el = 26 Jstr.to_string (Jv.get (El.to_jv el) "outerHTML" |> Jv.to_jstr) 27 28let inner_html el = 29 Jstr.to_string (Jv.get (El.to_jv el) "innerHTML" |> Jv.to_jstr) 30 31let iter_elements f root = 32 let rec walk el = 33 f el; 34 List.iter walk (El.children ~only_els:true el) 35 in 36 walk root 37 38let fold_elements f acc root = 39 let rec walk acc el = 40 let acc = f acc el in 41 List.fold_left walk acc (El.children ~only_els:true el) 42 in 43 walk acc root 44 45let filter_elements pred root = 46 fold_elements (fun acc el -> 47 if pred el then el :: acc else acc 48 ) [] root |> List.rev 49 50(* Build element map by walking browser DOM and parsed DOM in parallel *) 51let create root = 52 let raw_html = outer_html root in 53 (* Prepend DOCTYPE if not present - outerHTML doesn't include it *) 54 let html = 55 let lower = String.lowercase_ascii raw_html in 56 if String.length lower >= 9 && String.sub lower 0 9 = "<!doctype" then 57 raw_html 58 else 59 "<!DOCTYPE html>" ^ raw_html 60 in 61 62 (* Parse the HTML to get a tree with locations *) 63 let reader = Bytesrw.Bytes.Reader.of_string html in 64 let parsed = Html5rw.parse ~collect_errors:false reader in 65 66 (* Walk both trees in parallel to build the mapping. 67 Browser elements are in document order, and so are Html5rw nodes. *) 68 let browser_elements = fold_elements (fun acc el -> el :: acc) [] root |> List.rev in 69 70 (* Extract elements from Html5rw DOM in document order *) 71 let rec extract_html5rw_elements acc node = 72 if Html5rw.is_element node then 73 let children = node.Html5rw.Dom.children in 74 let acc = node :: acc in 75 List.fold_left extract_html5rw_elements acc children 76 else 77 let children = node.Html5rw.Dom.children in 78 List.fold_left extract_html5rw_elements acc children 79 in 80 let html5rw_elements = extract_html5rw_elements [] (Html5rw.root parsed) |> List.rev in 81 82 (* Build the location map by matching elements *) 83 let loc_to_el = 84 (* Find the starting point in parsed elements that matches the root tag *) 85 let root_tag = String.lowercase_ascii (Jstr.to_string (El.tag_name root)) in 86 let rec find_start = function 87 | [] -> [] 88 | h_el :: rest -> 89 if String.lowercase_ascii h_el.Html5rw.Dom.name = root_tag then 90 h_el :: rest 91 else 92 find_start rest 93 in 94 let html5rw_elements_aligned = find_start html5rw_elements in 95 96 let rec match_elements loc_map browser_els html5rw_els = 97 match browser_els, html5rw_els with 98 | [], _ | _, [] -> loc_map 99 | b_el :: b_rest, h_el :: h_rest -> 100 let b_tag = String.lowercase_ascii (Jstr.to_string (El.tag_name b_el)) in 101 let h_tag = String.lowercase_ascii h_el.Html5rw.Dom.name in 102 if b_tag = h_tag then 103 (* Tags match - record the mapping if we have a location *) 104 let loc_map = 105 match h_el.Html5rw.Dom.location with 106 | Some loc -> LocMap.add (loc.line, loc.column) b_el loc_map 107 | None -> loc_map 108 in 109 match_elements loc_map b_rest h_rest 110 else 111 (* Tags don't match - try skipping the parsed element first *) 112 (* This handles cases where parser creates implicit elements *) 113 match_elements loc_map browser_els h_rest 114 in 115 match_elements LocMap.empty browser_elements html5rw_elements_aligned 116 in 117 118 { root; html_source = html; loc_to_el }, html 119 120let find_by_location t ~line ~column = 121 LocMap.find_opt (line, column) t.loc_to_el 122 123let find_by_location_and_tag t ~line ~column ~tag = 124 match LocMap.find_opt (line, column) t.loc_to_el with 125 | Some el when String.lowercase_ascii (Jstr.to_string (El.tag_name el)) = 126 String.lowercase_ascii tag -> 127 Some el 128 | _ -> None 129 130let find_for_message t msg = 131 (* Try to find element by location first *) 132 match msg.Htmlrw_check.location with 133 | Some loc -> 134 (match msg.Htmlrw_check.element with 135 | Some tag -> find_by_location_and_tag t ~line:loc.line ~column:loc.column ~tag 136 | None -> find_by_location t ~line:loc.line ~column:loc.column) 137 | None -> 138 (* No location - try to find by element name if we have one *) 139 match msg.Htmlrw_check.element with 140 | Some tag -> 141 (* Find first element with this tag *) 142 let matches = filter_elements (fun el -> 143 String.lowercase_ascii (Jstr.to_string (El.tag_name el)) = 144 String.lowercase_ascii tag 145 ) t.root in 146 (match matches with 147 | el :: _ -> Some el 148 | [] -> None) 149 | None -> None 150 151let html_source t = t.html_source 152 153let root_element t = t.root 154 155let selector_path ?root el = 156 let stop_at = match root with 157 | Some r -> Some r 158 | None -> None 159 in 160 let rec build_path el acc = 161 (* Stop if we've reached the root *) 162 let should_stop = match stop_at with 163 | Some r -> el_equal el r 164 | None -> String.lowercase_ascii (Jstr.to_string (El.tag_name el)) = "body" 165 in 166 if should_stop then 167 acc 168 else 169 let tag = String.lowercase_ascii (Jstr.to_string (El.tag_name el)) in 170 let segment = 171 match El.parent el with 172 | None -> tag 173 | Some parent -> 174 let siblings = El.children ~only_els:true parent in 175 let same_tag = List.filter (fun sib -> 176 String.lowercase_ascii (Jstr.to_string (El.tag_name sib)) = tag 177 ) siblings in 178 if List.length same_tag <= 1 then 179 tag 180 else 181 let idx = 182 let rec find_idx i = function 183 | [] -> 1 184 | sib :: rest -> 185 if el_equal sib el then i 186 else find_idx (i + 1) rest 187 in 188 find_idx 1 same_tag 189 in 190 Printf.sprintf "%s:nth-of-type(%d)" tag idx 191 in 192 let new_acc = segment :: acc in 193 match El.parent el with 194 | None -> new_acc 195 | Some parent -> build_path parent new_acc 196 in 197 String.concat " > " (build_path el []) 198 199let short_selector ?root el = 200 (* Try ID first *) 201 match El.at (Jstr.v "id") el with 202 | Some id when not (Jstr.is_empty id) -> 203 "#" ^ Jstr.to_string id 204 | _ -> 205 (* Try parent ID + short path *) 206 let rec find_id_ancestor el depth = 207 if depth > 3 then None 208 else match El.parent el with 209 | None -> None 210 | Some parent -> 211 match El.at (Jstr.v "id") parent with 212 | Some id when not (Jstr.is_empty id) -> Some (parent, id) 213 | _ -> find_id_ancestor parent (depth + 1) 214 in 215 match find_id_ancestor el 0 with 216 | Some (ancestor, id) -> 217 let path = selector_path ~root:ancestor el in 218 "#" ^ Jstr.to_string id ^ " > " ^ path 219 | None -> 220 selector_path ?root el