OCaml HTML5 parser/serialiser based on Python's JustHTML

perf

Changed files
+141 -116
lib
+13 -13
lib/htmlrw_check/context_tracker.ml
··· 41 41 (** Iterate over all contexts (top to bottom). *) 42 42 val iter : 'a t -> ('a -> unit) -> unit 43 43 end = struct 44 - type 'a t = { mutable stack : 'a list } 44 + type 'a t = { mutable stack : 'a list; mutable len : int } 45 45 46 - let create () = { stack = [] } 47 - let reset t = t.stack <- [] 48 - let push t x = t.stack <- x :: t.stack 46 + let create () = { stack = []; len = 0 } 47 + let reset t = t.stack <- []; t.len <- 0 48 + let push t x = t.stack <- x :: t.stack; t.len <- t.len + 1 49 49 let pop t = match t.stack with 50 50 | [] -> None 51 - | x :: rest -> t.stack <- rest; Some x 51 + | x :: rest -> t.stack <- rest; t.len <- t.len - 1; Some x 52 52 let current t = match t.stack with 53 53 | [] -> None 54 54 | x :: _ -> Some x 55 - let depth t = List.length t.stack 56 - let is_empty t = t.stack = [] 55 + let depth t = t.len (* O(1) instead of O(n) *) 56 + let is_empty t = t.len = 0 57 57 let to_list t = List.rev t.stack 58 58 let exists t f = List.exists f t.stack 59 59 let find t f = List.find_opt f t.stack ··· 124 124 (** Get all ancestor names (outermost first). *) 125 125 val to_list : t -> string list 126 126 end = struct 127 - type t = { mutable stack : string list } 127 + type t = { mutable stack : string list; mutable len : int } 128 128 129 - let create () = { stack = [] } 130 - let reset t = t.stack <- [] 131 - let push t name = t.stack <- name :: t.stack 129 + let create () = { stack = []; len = 0 } 130 + let reset t = t.stack <- []; t.len <- 0 131 + let push t name = t.stack <- name :: t.stack; t.len <- t.len + 1 132 132 let pop t = match t.stack with 133 - | _ :: rest -> t.stack <- rest 133 + | _ :: rest -> t.stack <- rest; t.len <- t.len - 1 134 134 | [] -> () 135 135 let parent t = match t.stack with 136 136 | x :: _ -> Some x 137 137 | [] -> None 138 138 let has_ancestor t name = List.mem name t.stack 139 - let depth t = List.length t.stack 139 + let depth t = t.len (* O(1) instead of O(n) *) 140 140 let to_list t = List.rev t.stack 141 141 end
+6 -3
lib/htmlrw_check/datatype/datatype.ml
··· 42 42 else String.sub s start (end_pos - start + 1) 43 43 44 44 (** Factory for creating enum-based validators. 45 - Many HTML attributes accept a fixed set of keyword values. *) 45 + Many HTML attributes accept a fixed set of keyword values. 46 + Uses Hashtbl for O(1) membership check. *) 46 47 let make_enum ~name ~values ?(allow_empty = true) () : t = 47 - let values_set = List.map String.lowercase_ascii values in 48 + (* Pre-compute hashtable for O(1) membership *) 49 + let values_tbl = Hashtbl.create (List.length values) in 50 + List.iter (fun v -> Hashtbl.add values_tbl (String.lowercase_ascii v) ()) values; 48 51 let values_str = String.concat ", " (List.map (Printf.sprintf "'%s'") values) in 49 52 (module struct 50 53 let name = name 51 54 let validate s = 52 55 let s_lower = string_to_ascii_lowercase s in 53 - if (allow_empty && s = "") || List.mem s_lower values_set then Ok () 56 + if (allow_empty && s = "") || Hashtbl.mem values_tbl s_lower then Ok () 54 57 else Error (Printf.sprintf "The value '%s' is not a valid %s value. Expected %s%s." 55 58 s name (if allow_empty then "empty string, " else "") values_str) 56 59 let is_valid s = Result.is_ok (validate s)
+73 -72
lib/htmlrw_check/element/tag.ml
··· 157 157 158 158 (** {1 Conversion Functions} *) 159 159 160 - (** Convert a lowercase tag name string to html_tag option *) 161 - let html_tag_of_string_opt name = 162 - match name with 163 - (* Document metadata *) 164 - | "html" -> Some `Html | "head" -> Some `Head | "title" -> Some `Title 165 - | "base" -> Some `Base | "link" -> Some `Link | "meta" -> Some `Meta 166 - | "style" -> Some `Style 167 - (* Sectioning root *) 168 - | "body" -> Some `Body 169 - (* Content sectioning *) 170 - | "address" -> Some `Address | "article" -> Some `Article | "aside" -> Some `Aside 171 - | "footer" -> Some `Footer | "header" -> Some `Header | "hgroup" -> Some `Hgroup 172 - | "main" -> Some `Main | "nav" -> Some `Nav | "search" -> Some `Search 173 - | "section" -> Some `Section 174 - (* Headings *) 175 - | "h1" -> Some `H1 | "h2" -> Some `H2 | "h3" -> Some `H3 176 - | "h4" -> Some `H4 | "h5" -> Some `H5 | "h6" -> Some `H6 177 - (* Grouping content *) 178 - | "blockquote" -> Some `Blockquote | "dd" -> Some `Dd | "div" -> Some `Div 179 - | "dl" -> Some `Dl | "dt" -> Some `Dt | "figcaption" -> Some `Figcaption 180 - | "figure" -> Some `Figure | "hr" -> Some `Hr | "li" -> Some `Li 181 - | "menu" -> Some `Menu | "ol" -> Some `Ol | "p" -> Some `P 182 - | "pre" -> Some `Pre | "ul" -> Some `Ul 183 - (* Text-level semantics *) 184 - | "a" -> Some `A | "abbr" -> Some `Abbr | "b" -> Some `B 185 - | "bdi" -> Some `Bdi | "bdo" -> Some `Bdo | "br" -> Some `Br 186 - | "cite" -> Some `Cite | "code" -> Some `Code | "data" -> Some `Data 187 - | "dfn" -> Some `Dfn | "em" -> Some `Em | "i" -> Some `I 188 - | "kbd" -> Some `Kbd | "mark" -> Some `Mark | "q" -> Some `Q 189 - | "rp" -> Some `Rp | "rt" -> Some `Rt | "ruby" -> Some `Ruby 190 - | "s" -> Some `S | "samp" -> Some `Samp | "small" -> Some `Small 191 - | "span" -> Some `Span | "strong" -> Some `Strong | "sub" -> Some `Sub 192 - | "sup" -> Some `Sup | "time" -> Some `Time | "u" -> Some `U 193 - | "var" -> Some `Var | "wbr" -> Some `Wbr 194 - (* Edits *) 195 - | "del" -> Some `Del | "ins" -> Some `Ins 196 - (* Embedded content *) 197 - | "area" -> Some `Area | "audio" -> Some `Audio | "canvas" -> Some `Canvas 198 - | "embed" -> Some `Embed | "iframe" -> Some `Iframe | "img" -> Some `Img 199 - | "map" -> Some `Map | "object" -> Some `Object | "picture" -> Some `Picture 200 - | "source" -> Some `Source | "track" -> Some `Track | "video" -> Some `Video 201 - (* Tabular data *) 202 - | "caption" -> Some `Caption | "col" -> Some `Col | "colgroup" -> Some `Colgroup 203 - | "table" -> Some `Table | "tbody" -> Some `Tbody | "td" -> Some `Td 204 - | "tfoot" -> Some `Tfoot | "th" -> Some `Th | "thead" -> Some `Thead 205 - | "tr" -> Some `Tr 206 - (* Forms *) 207 - | "button" -> Some `Button | "datalist" -> Some `Datalist 208 - | "fieldset" -> Some `Fieldset | "form" -> Some `Form | "input" -> Some `Input 209 - | "label" -> Some `Label | "legend" -> Some `Legend | "meter" -> Some `Meter 210 - | "optgroup" -> Some `Optgroup | "option" -> Some `Option 211 - | "output" -> Some `Output | "progress" -> Some `Progress 212 - | "select" -> Some `Select | "textarea" -> Some `Textarea 213 - (* Interactive *) 214 - | "details" -> Some `Details | "dialog" -> Some `Dialog | "summary" -> Some `Summary 215 - (* Scripting *) 216 - | "noscript" -> Some `Noscript | "script" -> Some `Script 217 - | "slot" -> Some `Slot | "template" -> Some `Template 218 - (* Web Components / Misc *) 219 - | "portal" -> Some `Portal | "param" -> Some `Param 220 - (* Deprecated/obsolete elements *) 221 - | "applet" -> Some `Applet | "acronym" -> Some `Acronym | "bgsound" -> Some `Bgsound 222 - | "dir" -> Some `Dir | "frame" -> Some `Frame | "frameset" -> Some `Frameset 223 - | "noframes" -> Some `Noframes | "isindex" -> Some `Isindex | "keygen" -> Some `Keygen 224 - | "listing" -> Some `Listing | "menuitem" -> Some `Menuitem | "nextid" -> Some `Nextid 225 - | "noembed" -> Some `Noembed | "plaintext" -> Some `Plaintext 226 - | "rb" -> Some `Rb | "rtc" -> Some `Rtc | "strike" -> Some `Strike | "xmp" -> Some `Xmp 227 - | "basefont" -> Some `Basefont | "big" -> Some `Big | "blink" -> Some `Blink 228 - | "center" -> Some `Center | "font" -> Some `Font | "marquee" -> Some `Marquee 229 - | "multicol" -> Some `Multicol | "nobr" -> Some `Nobr | "spacer" -> Some `Spacer 230 - | "tt" -> Some `Tt | "image" -> Some `Image 231 - | _ -> None 160 + (** Hashtable for O(1) tag name lookup - initialized once at module load *) 161 + let html_tag_table : (string, html_tag) Hashtbl.t = 162 + let tbl = Hashtbl.create 128 in 163 + List.iter (fun (name, tag) -> Hashtbl.add tbl name tag) [ 164 + (* Document metadata *) 165 + ("html", `Html); ("head", `Head); ("title", `Title); 166 + ("base", `Base); ("link", `Link); ("meta", `Meta); ("style", `Style); 167 + (* Sectioning root *) 168 + ("body", `Body); 169 + (* Content sectioning *) 170 + ("address", `Address); ("article", `Article); ("aside", `Aside); 171 + ("footer", `Footer); ("header", `Header); ("hgroup", `Hgroup); 172 + ("main", `Main); ("nav", `Nav); ("search", `Search); ("section", `Section); 173 + (* Headings *) 174 + ("h1", `H1); ("h2", `H2); ("h3", `H3); 175 + ("h4", `H4); ("h5", `H5); ("h6", `H6); 176 + (* Grouping content *) 177 + ("blockquote", `Blockquote); ("dd", `Dd); ("div", `Div); 178 + ("dl", `Dl); ("dt", `Dt); ("figcaption", `Figcaption); 179 + ("figure", `Figure); ("hr", `Hr); ("li", `Li); 180 + ("menu", `Menu); ("ol", `Ol); ("p", `P); ("pre", `Pre); ("ul", `Ul); 181 + (* Text-level semantics *) 182 + ("a", `A); ("abbr", `Abbr); ("b", `B); 183 + ("bdi", `Bdi); ("bdo", `Bdo); ("br", `Br); 184 + ("cite", `Cite); ("code", `Code); ("data", `Data); 185 + ("dfn", `Dfn); ("em", `Em); ("i", `I); 186 + ("kbd", `Kbd); ("mark", `Mark); ("q", `Q); 187 + ("rp", `Rp); ("rt", `Rt); ("ruby", `Ruby); 188 + ("s", `S); ("samp", `Samp); ("small", `Small); 189 + ("span", `Span); ("strong", `Strong); ("sub", `Sub); 190 + ("sup", `Sup); ("time", `Time); ("u", `U); 191 + ("var", `Var); ("wbr", `Wbr); 192 + (* Edits *) 193 + ("del", `Del); ("ins", `Ins); 194 + (* Embedded content *) 195 + ("area", `Area); ("audio", `Audio); ("canvas", `Canvas); 196 + ("embed", `Embed); ("iframe", `Iframe); ("img", `Img); 197 + ("map", `Map); ("object", `Object); ("picture", `Picture); 198 + ("source", `Source); ("track", `Track); ("video", `Video); 199 + (* Tabular data *) 200 + ("caption", `Caption); ("col", `Col); ("colgroup", `Colgroup); 201 + ("table", `Table); ("tbody", `Tbody); ("td", `Td); 202 + ("tfoot", `Tfoot); ("th", `Th); ("thead", `Thead); ("tr", `Tr); 203 + (* Forms *) 204 + ("button", `Button); ("datalist", `Datalist); 205 + ("fieldset", `Fieldset); ("form", `Form); ("input", `Input); 206 + ("label", `Label); ("legend", `Legend); ("meter", `Meter); 207 + ("optgroup", `Optgroup); ("option", `Option); 208 + ("output", `Output); ("progress", `Progress); 209 + ("select", `Select); ("textarea", `Textarea); 210 + (* Interactive *) 211 + ("details", `Details); ("dialog", `Dialog); ("summary", `Summary); 212 + (* Scripting *) 213 + ("noscript", `Noscript); ("script", `Script); 214 + ("slot", `Slot); ("template", `Template); 215 + (* Web Components / Misc *) 216 + ("portal", `Portal); ("param", `Param); 217 + (* Deprecated/obsolete elements *) 218 + ("applet", `Applet); ("acronym", `Acronym); ("bgsound", `Bgsound); 219 + ("dir", `Dir); ("frame", `Frame); ("frameset", `Frameset); 220 + ("noframes", `Noframes); ("isindex", `Isindex); ("keygen", `Keygen); 221 + ("listing", `Listing); ("menuitem", `Menuitem); ("nextid", `Nextid); 222 + ("noembed", `Noembed); ("plaintext", `Plaintext); 223 + ("rb", `Rb); ("rtc", `Rtc); ("strike", `Strike); ("xmp", `Xmp); 224 + ("basefont", `Basefont); ("big", `Big); ("blink", `Blink); 225 + ("center", `Center); ("font", `Font); ("marquee", `Marquee); 226 + ("multicol", `Multicol); ("nobr", `Nobr); ("spacer", `Spacer); 227 + ("tt", `Tt); ("image", `Image); 228 + ]; 229 + tbl 230 + 231 + (** Convert a lowercase tag name string to html_tag option - O(1) lookup *) 232 + let html_tag_of_string_opt name = Hashtbl.find_opt html_tag_table name 232 233 233 234 (** Check if a name is a valid custom element name (contains hyphen, not reserved) *) 234 235 let is_custom_element_name name =
+21 -11
lib/htmlrw_check/semantic/lang_detecting_checker.ml
··· 16 16 let max_chars = 30720 17 17 let min_chars = 1024 18 18 19 - (* Elements whose text content we skip for language detection *) 20 - let skip_elements = [ 21 - "a"; "button"; "details"; "figcaption"; "form"; "li"; "nav"; 22 - "pre"; "script"; "select"; "span"; "style"; "summary"; 23 - "td"; "textarea"; "th"; "tr" 24 - ] 19 + (* Elements whose text content we skip for language detection - O(1) lookup *) 20 + let skip_elements = 21 + let tbl = Hashtbl.create 20 in 22 + List.iter (fun e -> Hashtbl.add tbl e ()) [ 23 + "a"; "button"; "details"; "figcaption"; "form"; "li"; "nav"; 24 + "pre"; "script"; "select"; "span"; "style"; "summary"; 25 + "td"; "textarea"; "th"; "tr" 26 + ]; 27 + tbl 25 28 26 - (* RTL languages *) 27 - let rtl_langs = ["ar"; "azb"; "ckb"; "dv"; "fa"; "he"; "pnb"; "ps"; "sd"; "ug"; "ur"; "iw"] 29 + let is_skip_element name = Hashtbl.mem skip_elements name 30 + 31 + (* RTL languages - O(1) lookup *) 32 + let rtl_langs = 33 + let tbl = Hashtbl.create 16 in 34 + List.iter (fun l -> Hashtbl.add tbl l ()) ["ar"; "azb"; "ckb"; "dv"; "fa"; "he"; "pnb"; "ps"; "sd"; "ug"; "ur"; "iw"]; 35 + tbl 36 + 37 + let is_rtl_lang lang = Hashtbl.mem rtl_langs lang 28 38 29 39 let create () = { 30 40 html_lang = None; ··· 217 227 if state.foreign_depth > 0 then 218 228 state.foreign_depth <- state.foreign_depth + 1 219 229 (* Check if we should skip this element's text *) 220 - else if List.mem name_lower skip_elements then 230 + else if is_skip_element name_lower then 221 231 state.skip_depth <- state.skip_depth + 1 222 232 else begin 223 233 (* Check for different lang attribute *) ··· 241 251 if state.foreign_depth > 0 then 242 252 state.foreign_depth <- state.foreign_depth - 1 243 253 else if state.skip_depth > 0 then begin 244 - if List.mem name_lower skip_elements then 254 + if is_skip_element name_lower then 245 255 state.skip_depth <- state.skip_depth - 1 246 256 else 247 257 (* TODO: properly track nested elements with different lang *) ··· 313 323 end; 314 324 315 325 (* Check dir attribute for RTL languages *) 316 - if List.mem base_detected rtl_langs then begin 326 + if is_rtl_lang base_detected then begin 317 327 match state.html_dir with 318 328 | None -> 319 329 Message_collector.add_typed collector
+10 -7
lib/htmlrw_check/semantic/nesting_checker.ml
··· 13 13 "s"; "small"; "mark"; "abbr"; "cite"; "code"; "q"; "sub"; "sup"; "samp"; 14 14 "kbd"; "var" |] 15 15 16 + (** Hashtable for O(1) lookup of special ancestor bit positions *) 17 + let special_ancestor_table : (string, int) Hashtbl.t = 18 + let tbl = Hashtbl.create 64 in 19 + Array.iteri (fun i name -> Hashtbl.add tbl name i) special_ancestors; 20 + tbl 21 + 16 22 (** Get the bit position for a special ancestor element. 17 - Returns [-1] if the element is not a special ancestor. *) 23 + Returns [-1] if the element is not a special ancestor. O(1) lookup. *) 18 24 let special_ancestor_number name = 19 - let rec find i = 20 - if i >= Array.length special_ancestors then -1 21 - else if special_ancestors.(i) = name then i 22 - else find (i + 1) 23 - in 24 - find 0 25 + match Hashtbl.find_opt special_ancestor_table name with 26 + | Some i -> i 27 + | None -> -1 25 28 26 29 (** Interactive elements that cannot be nested inside [a] or [button]. *) 27 30 let interactive_elements =
+9 -6
lib/htmlrw_check/semantic/obsolete_checker.ml
··· 188 188 189 189 tbl 190 190 191 - (** Obsolete style attributes map: attr_name -> element_name list *) 192 - let obsolete_style_attrs = 191 + (** Obsolete style attributes map: attr_name -> element_name -> unit hashtable 192 + Uses nested hashtables for O(1) lookup instead of List.mem O(n) *) 193 + let obsolete_style_attrs : (string, (string, unit) Hashtbl.t) Hashtbl.t = 193 194 let tbl = Hashtbl.create 64 in 194 195 195 196 let register attr_name elements = 196 - Hashtbl.add tbl attr_name elements 197 + let elem_tbl = Hashtbl.create (List.length elements) in 198 + List.iter (fun e -> Hashtbl.add elem_tbl e ()) elements; 199 + Hashtbl.add tbl attr_name elem_tbl 197 200 in 198 201 199 202 register "align" ["caption"; "iframe"; "img"; "input"; "object"; "embed"; "legend"; "table"; "hr"; "div"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "p"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"]; ··· 292 295 Message_collector.add_typed collector 293 296 (`Element (`Obsolete_attr (`Elem name, `Attr attr_name, `Suggestion (Some suggestion)))))); 294 297 295 - (* Check obsolete style attributes *) 298 + (* Check obsolete style attributes - O(1) nested hashtable lookup *) 296 299 (match Hashtbl.find_opt obsolete_style_attrs attr_lower with 297 300 | None -> () 298 - | Some elements -> 299 - if List.mem name_lower elements then 301 + | Some elem_tbl -> 302 + if Hashtbl.mem elem_tbl name_lower then 300 303 Message_collector.add_typed collector 301 304 (`Element (`Obsolete_attr (`Elem name, `Attr attr_name, `Suggestion (Some "Use CSS instead."))))); 302 305
+9 -4
lib/htmlrw_check/specialized/label_checker.ml
··· 2 2 Validates that label element contains at most one labelable element 3 3 and that descendants with for attribute have matching ids. *) 4 4 5 - (** Labelable elements that label can reference *) 6 - let labelable_elements = ["button"; "input"; "meter"; "output"; "progress"; "select"; "textarea"] 5 + (** Labelable elements that label can reference - O(1) hashtable lookup *) 6 + let labelable_elements = 7 + let tbl = Hashtbl.create 8 in 8 + List.iter (fun e -> Hashtbl.add tbl e ()) ["button"; "input"; "meter"; "output"; "progress"; "select"; "textarea"]; 9 + tbl 10 + 11 + let is_labelable name = Hashtbl.mem labelable_elements name 7 12 8 13 type label_for_info = { 9 14 for_target : string; ··· 65 70 let name_lower = String.lowercase_ascii (Tag.tag_to_string (Tag.Html tag)) in 66 71 67 72 (* Track labelable element IDs *) 68 - (if List.mem name_lower labelable_elements then 73 + (if is_labelable name_lower then 69 74 match Attr_utils.get_attr "id" element.raw_attrs with 70 75 | Some id -> state.labelable_ids <- id :: state.labelable_ids 71 76 | None -> ()); ··· 74 79 state.label_depth <- state.label_depth + 1; 75 80 76 81 (* Check for labelable elements inside label *) 77 - if List.mem name_lower labelable_elements then begin 82 + if is_labelable name_lower then begin 78 83 state.labelable_count <- state.labelable_count + 1; 79 84 if state.labelable_count > 1 then 80 85 Message_collector.add_typed collector (`Label `Too_many_labelable);