OCaml HTML5 parser/serialiser based on Python's JustHTML

trim

Changed files
-332
lib
-96
lib/check/ancestor_tracker.ml
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 - SPDX-License-Identifier: MIT 4 - ---------------------------------------------------------------------------*) 5 - 6 - (** Generic ancestor tracking for DOM traversal. *) 7 - 8 - type 'data context = { 9 - name : string; 10 - data : 'data; 11 - } 12 - 13 - type 'data t = { 14 - mutable stack : 'data context list; 15 - } 16 - 17 - let create () = { stack = [] } 18 - 19 - let reset tracker = tracker.stack <- [] 20 - 21 - let push tracker name data = 22 - let name_lower = Astring.String.Ascii.lowercase name in 23 - let context = { name = name_lower; data } in 24 - tracker.stack <- context :: tracker.stack 25 - 26 - let pop tracker = 27 - match tracker.stack with 28 - | [] -> () (* Gracefully handle underflow *) 29 - | _ :: rest -> tracker.stack <- rest 30 - 31 - let peek tracker = 32 - match tracker.stack with 33 - | [] -> None 34 - | hd :: _ -> Some hd 35 - 36 - let depth tracker = List.length tracker.stack 37 - 38 - let has_ancestor tracker name = 39 - let name_lower = Astring.String.Ascii.lowercase name in 40 - List.exists (fun ctx -> String.equal ctx.name name_lower) tracker.stack 41 - 42 - let has_ancestor_with tracker predicate = 43 - List.exists (fun ctx -> predicate ctx.name ctx.data) tracker.stack 44 - 45 - let find_ancestor tracker name = 46 - let name_lower = Astring.String.Ascii.lowercase name in 47 - List.find_opt (fun ctx -> String.equal ctx.name name_lower) tracker.stack 48 - 49 - let find_ancestor_with tracker predicate = 50 - List.find_opt (fun ctx -> predicate ctx.name ctx.data) tracker.stack 51 - 52 - let get_all_ancestors tracker = tracker.stack 53 - 54 - let filter_ancestors tracker predicate = 55 - List.filter (fun ctx -> predicate ctx.name ctx.data) tracker.stack 56 - 57 - let exists = has_ancestor_with 58 - 59 - let for_all tracker predicate = 60 - List.for_all (fun ctx -> predicate ctx.name ctx.data) tracker.stack 61 - 62 - let iter tracker f = 63 - List.iter (fun ctx -> f ctx.name ctx.data) tracker.stack 64 - 65 - let fold tracker f init = 66 - List.fold_left (fun acc ctx -> f acc ctx.name ctx.data) init tracker.stack 67 - 68 - let get_parent = peek 69 - 70 - let get_parent_name tracker = 71 - match peek tracker with 72 - | Some ctx -> Some ctx.name 73 - | None -> None 74 - 75 - let get_parent_data tracker = 76 - match peek tracker with 77 - | Some ctx -> Some ctx.data 78 - | None -> None 79 - 80 - let has_any_ancestor tracker names = 81 - let names_lower = List.map Astring.String.Ascii.lowercase names in 82 - List.exists (fun ctx -> List.mem ctx.name names_lower) tracker.stack 83 - 84 - let find_first_matching tracker names = 85 - let names_lower = List.map Astring.String.Ascii.lowercase names in 86 - List.find_map (fun ctx -> 87 - if List.mem ctx.name names_lower then Some (ctx.name, ctx) 88 - else None 89 - ) tracker.stack 90 - 91 - let is_empty tracker = tracker.stack = [] 92 - 93 - let to_list = get_all_ancestors 94 - 95 - let to_name_list tracker = 96 - List.map (fun ctx -> ctx.name) tracker.stack
-236
lib/check/ancestor_tracker.mli
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 - SPDX-License-Identifier: MIT 4 - ---------------------------------------------------------------------------*) 5 - 6 - (** Generic ancestor tracking for DOM traversal. 7 - 8 - This module provides a generic stack-based ancestor tracker that can be 9 - used by various HTML checkers to track element nesting during DOM 10 - traversal. It is parameterized by the type of data stored per element. 11 - 12 - {2 Design} 13 - 14 - The tracker maintains a stack of element contexts, where each context 15 - contains: 16 - - The element name (string) 17 - - Custom data of type ['data] (checker-specific) 18 - 19 - The stack is automatically managed through [push] and [pop] operations 20 - that should be called in response to start_element and end_element events. 21 - 22 - {2 Usage Example} 23 - 24 - {[ 25 - (* Define custom data type *) 26 - type role_data = { 27 - explicit_roles : string list; 28 - implicit_role : string option; 29 - } 30 - 31 - (* Create tracker *) 32 - let tracker = Ancestor_tracker.create () 33 - 34 - (* Push element context *) 35 - let data = { explicit_roles = ["button"]; implicit_role = None } in 36 - Ancestor_tracker.push tracker "div" data; 37 - 38 - (* Query ancestors *) 39 - let has_button = Ancestor_tracker.exists tracker 40 - (fun name data -> List.mem "button" data.explicit_roles) 41 - in 42 - 43 - (* Pop when element closes *) 44 - Ancestor_tracker.pop tracker 45 - ]} 46 - *) 47 - 48 - (** {1 Types} *) 49 - 50 - (** Ancestor context containing element name and custom data. *) 51 - type 'data context = { 52 - name : string; 53 - (** Element name (lowercase). *) 54 - data : 'data; 55 - (** Checker-specific data. *) 56 - } 57 - 58 - (** Ancestor tracker state. *) 59 - type 'data t 60 - 61 - (** {1 Creation} *) 62 - 63 - val create : unit -> 'data t 64 - (** [create ()] creates a new empty ancestor tracker. *) 65 - 66 - val reset : 'data t -> unit 67 - (** [reset tracker] clears all ancestor contexts from the tracker. *) 68 - 69 - (** {1 Stack Operations} *) 70 - 71 - val push : 'data t -> string -> 'data -> unit 72 - (** [push tracker name data] pushes a new element context onto the stack. 73 - 74 - This should be called in [start_element] event handlers. 75 - 76 - @param tracker The ancestor tracker 77 - @param name The element name (will be lowercased) 78 - @param data Checker-specific data to associate with this element *) 79 - 80 - val pop : 'data t -> unit 81 - (** [pop tracker] pops the most recent element context from the stack. 82 - 83 - This should be called in [end_element] event handlers. 84 - 85 - @param tracker The ancestor tracker *) 86 - 87 - val peek : 'data t -> 'data context option 88 - (** [peek tracker] returns the most recent element context without removing it. 89 - 90 - @param tracker The ancestor tracker 91 - @return [Some context] if the stack is non-empty, [None] otherwise *) 92 - 93 - val depth : 'data t -> int 94 - (** [depth tracker] returns the current stack depth (number of ancestors). 95 - 96 - @param tracker The ancestor tracker 97 - @return The number of elements on the stack *) 98 - 99 - (** {1 Ancestor Queries} *) 100 - 101 - val has_ancestor : 'data t -> string -> bool 102 - (** [has_ancestor tracker name] checks if an element with the given name 103 - exists anywhere in the ancestor chain. 104 - 105 - @param tracker The ancestor tracker 106 - @param name The element name to search for (case-insensitive) 107 - @return [true] if an ancestor with this name exists *) 108 - 109 - val has_ancestor_with : 'data t -> (string -> 'data -> bool) -> bool 110 - (** [has_ancestor_with tracker predicate] checks if any ancestor satisfies 111 - the given predicate. 112 - 113 - @param tracker The ancestor tracker 114 - @param predicate Function that tests element name and data 115 - @return [true] if any ancestor satisfies the predicate *) 116 - 117 - val find_ancestor : 'data t -> string -> 'data context option 118 - (** [find_ancestor tracker name] finds the nearest ancestor with the given name. 119 - 120 - @param tracker The ancestor tracker 121 - @param name The element name to search for (case-insensitive) 122 - @return [Some context] for the nearest matching ancestor, [None] if not found *) 123 - 124 - val find_ancestor_with : 'data t -> (string -> 'data -> bool) -> 'data context option 125 - (** [find_ancestor_with tracker predicate] finds the nearest ancestor that 126 - satisfies the predicate. 127 - 128 - @param tracker The ancestor tracker 129 - @param predicate Function that tests element name and data 130 - @return [Some context] for the nearest matching ancestor, [None] if not found *) 131 - 132 - val get_all_ancestors : 'data t -> 'data context list 133 - (** [get_all_ancestors tracker] returns all ancestor contexts from nearest to root. 134 - 135 - @param tracker The ancestor tracker 136 - @return List of contexts, with the most recent first *) 137 - 138 - val filter_ancestors : 'data t -> (string -> 'data -> bool) -> 'data context list 139 - (** [filter_ancestors tracker predicate] returns all ancestors that satisfy 140 - the predicate. 141 - 142 - @param tracker The ancestor tracker 143 - @param predicate Function that tests element name and data 144 - @return List of matching contexts, from nearest to root *) 145 - 146 - val exists : 'data t -> (string -> 'data -> bool) -> bool 147 - (** [exists tracker predicate] checks if any ancestor satisfies the predicate. 148 - 149 - This is an alias for {!has_ancestor_with}. 150 - 151 - @param tracker The ancestor tracker 152 - @param predicate Function that tests element name and data 153 - @return [true] if any ancestor satisfies the predicate *) 154 - 155 - val for_all : 'data t -> (string -> 'data -> bool) -> bool 156 - (** [for_all tracker predicate] checks if all ancestors satisfy the predicate. 157 - 158 - @param tracker The ancestor tracker 159 - @param predicate Function that tests element name and data 160 - @return [true] if all ancestors satisfy the predicate (vacuously true for empty stack) *) 161 - 162 - val iter : 'data t -> (string -> 'data -> unit) -> unit 163 - (** [iter tracker f] applies function [f] to each ancestor from nearest to root. 164 - 165 - @param tracker The ancestor tracker 166 - @param f Function to apply to each ancestor *) 167 - 168 - val fold : 'data t -> ('acc -> string -> 'data -> 'acc) -> 'acc -> 'acc 169 - (** [fold tracker f init] folds over ancestors from nearest to root. 170 - 171 - @param tracker The ancestor tracker 172 - @param f Folding function 173 - @param init Initial accumulator value 174 - @return Final accumulator value *) 175 - 176 - (** {1 Parent Access} *) 177 - 178 - val get_parent : 'data t -> 'data context option 179 - (** [get_parent tracker] returns the immediate parent element context. 180 - 181 - This is equivalent to {!peek}. 182 - 183 - @param tracker The ancestor tracker 184 - @return [Some context] for the parent, [None] if at root *) 185 - 186 - val get_parent_name : 'data t -> string option 187 - (** [get_parent_name tracker] returns the immediate parent element name. 188 - 189 - @param tracker The ancestor tracker 190 - @return [Some name] for the parent, [None] if at root *) 191 - 192 - val get_parent_data : 'data t -> 'data option 193 - (** [get_parent_data tracker] returns the immediate parent's custom data. 194 - 195 - @param tracker The ancestor tracker 196 - @return [Some data] for the parent, [None] if at root *) 197 - 198 - (** {1 Multiple Ancestor Queries} *) 199 - 200 - val has_any_ancestor : 'data t -> string list -> bool 201 - (** [has_any_ancestor tracker names] checks if any of the given element names 202 - exists in the ancestor chain. 203 - 204 - @param tracker The ancestor tracker 205 - @param names List of element names to search for 206 - @return [true] if any name matches an ancestor *) 207 - 208 - val find_first_matching : 'data t -> string list -> (string * 'data context) option 209 - (** [find_first_matching tracker names] finds the nearest ancestor that matches 210 - any of the given names. 211 - 212 - @param tracker The ancestor tracker 213 - @param names List of element names to search for 214 - @return [Some (matched_name, context)] for the first match, [None] if no match *) 215 - 216 - (** {1 Stack Inspection} *) 217 - 218 - val is_empty : 'data t -> bool 219 - (** [is_empty tracker] checks if the stack is empty (at document root). 220 - 221 - @param tracker The ancestor tracker 222 - @return [true] if no elements are on the stack *) 223 - 224 - val to_list : 'data t -> 'data context list 225 - (** [to_list tracker] converts the stack to a list of contexts. 226 - 227 - This is an alias for {!get_all_ancestors}. 228 - 229 - @param tracker The ancestor tracker 230 - @return List of contexts from nearest to root *) 231 - 232 - val to_name_list : 'data t -> string list 233 - (** [to_name_list tracker] returns just the element names in the stack. 234 - 235 - @param tracker The ancestor tracker 236 - @return List of element names from nearest to root *)