OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 3549498fc6a104ca0f4e4921dd84f58e25e5826d 248 lines 8.2 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6(* HTML5 DOM node types *) 7 8type doctype_data = { 9 name : string option; 10 public_id : string option; 11 system_id : string option; 12} 13 14(** Source location for nodes *) 15type location = { 16 line : int; 17 column : int; 18 end_line : int option; 19 end_column : int option; 20} 21 22type quirks_mode = No_quirks | Quirks | Limited_quirks 23 24type node = { 25 mutable name : string; 26 mutable namespace : string option; (* None = html, Some "svg", Some "mathml" *) 27 mutable attrs : (string * string) list; 28 mutable children : node list; 29 mutable parent : node option; 30 mutable data : string; (* For text, comment nodes *) 31 mutable template_content : node option; (* For <template> elements *) 32 mutable doctype : doctype_data option; (* For doctype nodes *) 33 mutable location : location option; (* Source location where node was parsed *) 34} 35 36(* Node name constants *) 37let document_name = "#document" 38let document_fragment_name = "#document-fragment" 39let text_name = "#text" 40let comment_name = "#comment" 41let doctype_name = "!doctype" 42 43(* Base node constructor - all nodes share this structure *) 44let make_node ~name ?(namespace=None) ?(attrs=[]) ?(data="") ?template_content ?doctype ?location () = { 45 name; 46 namespace; 47 attrs; 48 children = []; 49 parent = None; 50 data; 51 template_content; 52 doctype; 53 location; 54} 55 56(* Constructors *) 57let create_element name ?(namespace=None) ?(attrs=[]) ?location () = 58 make_node ~name ~namespace ~attrs ?location () 59 60let create_text ?location data = 61 make_node ~name:text_name ~data ?location () 62 63let create_comment ?location data = 64 make_node ~name:comment_name ~data ?location () 65 66let create_document () = 67 make_node ~name:document_name () 68 69let create_document_fragment () = 70 make_node ~name:document_fragment_name () 71 72let create_doctype ?name ?public_id ?system_id ?location () = 73 make_node ~name:doctype_name ~doctype:{ name; public_id; system_id } ?location () 74 75let create_template ?(namespace=None) ?(attrs=[]) ?location () = 76 let node = create_element "template" ~namespace ~attrs ?location () in 77 node.template_content <- Some (create_document_fragment ()); 78 node 79 80(* Predicates *) 81let is_element node = 82 not (List.mem node.name [text_name; comment_name; document_name; document_fragment_name; doctype_name]) 83 84let is_text node = node.name = text_name 85let is_comment node = node.name = comment_name 86let is_document node = node.name = document_name 87let is_document_fragment node = node.name = document_fragment_name 88let is_doctype node = node.name = doctype_name 89let has_children node = node.children <> [] 90 91(* DOM manipulation *) 92let append_child parent child = 93 child.parent <- Some parent; 94 parent.children <- parent.children @ [child] 95 96let insert_before parent new_child ref_child = 97 new_child.parent <- Some parent; 98 let rec insert acc = function 99 | [] -> List.rev acc @ [new_child] 100 | x :: xs when x == ref_child -> List.rev acc @ [new_child; x] @ xs 101 | x :: xs -> insert (x :: acc) xs 102 in 103 parent.children <- insert [] parent.children 104 105let remove_child parent child = 106 child.parent <- None; 107 parent.children <- List.filter (fun c -> c != child) parent.children 108 109(* Find the last text node before a reference point *) 110let last_child_text parent = 111 match List.rev parent.children with 112 | last :: _ when is_text last -> Some last 113 | _ -> None 114 115let insert_text_at parent text before_node = 116 match before_node with 117 | None -> 118 (* Append - merge with last child if it's text *) 119 (match last_child_text parent with 120 | Some txt -> txt.data <- txt.data ^ text 121 | None -> append_child parent (create_text text)) 122 | Some ref -> 123 (* Find last text node before ref_child *) 124 let rec find_prev_text = function 125 | [] | [_] -> None 126 | prev :: curr :: _ when curr == ref && is_text prev -> Some prev 127 | _ :: rest -> find_prev_text rest 128 in 129 match find_prev_text parent.children with 130 | Some txt -> txt.data <- txt.data ^ text 131 | None -> insert_before parent (create_text text) ref 132 133(* Location helpers *) 134let make_location ~line ~column ?end_line ?end_column () = 135 { line; column; end_line; end_column } 136 137let set_location node ~line ~column ?end_line ?end_column () = 138 node.location <- Some { line; column; end_line; end_column } 139 140let get_location node = node.location 141 142(* Attribute helpers *) 143let get_attr node name = List.assoc_opt name node.attrs 144 145let set_attr node name value = 146 node.attrs <- List.filter (fun (n, _) -> n <> name) node.attrs @ [(name, value)] 147 148let has_attr node name = List.mem_assoc name node.attrs 149 150(* Whitespace splitting for space-separated attribute values per HTML5 spec. 151 Handles ASCII whitespace: space, tab, newline, carriage return, form feed *) 152let split_on_whitespace s = 153 let is_whitespace = function 154 | ' ' | '\t' | '\n' | '\r' | '\x0c' -> true 155 | _ -> false 156 in 157 let len = String.length s in 158 let rec find_start acc i = 159 if i >= len then List.rev acc 160 else if is_whitespace s.[i] then find_start acc (i + 1) 161 else find_end acc i (i + 1) 162 and find_end acc start i = 163 if i >= len then List.rev (String.sub s start (i - start) :: acc) 164 else if is_whitespace s.[i] then find_start (String.sub s start (i - start) :: acc) (i + 1) 165 else find_end acc start (i + 1) 166 in 167 find_start [] 0 168 169(* Get space-separated attribute as list *) 170let get_attr_list node name = 171 match get_attr node name with 172 | Some s -> split_on_whitespace s 173 | None -> [] 174 175(* Common space-separated attribute accessors *) 176let get_class_list node = get_attr_list node "class" 177let get_rel_list node = List.map String.lowercase_ascii (get_attr_list node "rel") 178let get_headers_list node = get_attr_list node "headers" 179let get_itemref_list node = get_attr_list node "itemref" 180let get_itemprop_list node = get_attr_list node "itemprop" 181let get_itemtype_list node = get_attr_list node "itemtype" 182 183(* Tree traversal *) 184let rec descendants node = 185 List.concat_map (fun n -> n :: descendants n) node.children 186 187let ancestors node = 188 let rec collect acc n = 189 match n.parent with 190 | None -> List.rev acc 191 | Some p -> collect (p :: acc) p 192 in 193 collect [] node 194 195let rec get_text_content node = 196 if is_text node then node.data 197 else String.concat "" (List.map get_text_content node.children) 198 199(* Clone *) 200let rec clone ?(deep=false) node = 201 let new_node = make_node 202 ~name:node.name 203 ~namespace:node.namespace 204 ~attrs:node.attrs 205 ~data:node.data 206 ?doctype:node.doctype 207 ?location:node.location 208 () 209 in 210 if deep then begin 211 new_node.children <- List.map (clone ~deep:true) node.children; 212 List.iter (fun c -> c.parent <- Some new_node) new_node.children; 213 Option.iter (fun tc -> 214 new_node.template_content <- Some (clone ~deep:true tc) 215 ) node.template_content 216 end; 217 new_node 218 219(* Pretty printers *) 220let pp_doctype_data fmt (d : doctype_data) = 221 Format.fprintf fmt "<!DOCTYPE %s%s%s>" 222 (Option.value ~default:"" d.name) 223 (match d.public_id with Some p -> " PUBLIC \"" ^ p ^ "\"" | None -> "") 224 (match d.system_id with Some s -> " \"" ^ s ^ "\"" | None -> "") 225 226let pp_quirks_mode fmt = function 227 | No_quirks -> Format.pp_print_string fmt "no-quirks" 228 | Quirks -> Format.pp_print_string fmt "quirks" 229 | Limited_quirks -> Format.pp_print_string fmt "limited-quirks" 230 231let pp fmt node = 232 if is_text node then 233 Format.fprintf fmt "#text %S" node.data 234 else if is_comment node then 235 Format.fprintf fmt "<!-- %s -->" node.data 236 else if is_document node then 237 Format.pp_print_string fmt "#document" 238 else if is_document_fragment node then 239 Format.pp_print_string fmt "#document-fragment" 240 else if is_doctype node then 241 (match node.doctype with 242 | Some d -> pp_doctype_data fmt d 243 | None -> Format.pp_print_string fmt "<!DOCTYPE>") 244 else begin 245 Format.fprintf fmt "<%s" node.name; 246 List.iter (fun (k, v) -> Format.fprintf fmt " %s=%S" k v) node.attrs; 247 Format.pp_print_char fmt '>' 248 end