OCaml HTML5 parser/serialiser based on Python's JustHTML

perf

+264
bench/bench.ml
··· 1 + (* Simple benchmarks for HTML5 parsing and validation *) 2 + 3 + let time_it name iterations f = 4 + Gc.full_major (); 5 + let start = Unix.gettimeofday () in 6 + for _ = 1 to iterations do 7 + ignore (f ()) 8 + done; 9 + let elapsed = Unix.gettimeofday () -. start in 10 + let per_iter = elapsed /. float_of_int iterations *. 1000.0 in 11 + Printf.printf "%-40s %d iters in %.3fs (%.3f ms/iter)\n%!" name iterations elapsed per_iter 12 + 13 + (* Generate HTML with nested elements *) 14 + let generate_nested_html depth = 15 + let buf = Buffer.create 4096 in 16 + Buffer.add_string buf "<!DOCTYPE html><html><head><title>Test</title></head><body>"; 17 + for i = 1 to depth do 18 + Buffer.add_string buf (Printf.sprintf "<div id=\"d%d\" class=\"c1 c2 c3\">" i) 19 + done; 20 + Buffer.add_string buf "<p>Content</p>"; 21 + for _ = 1 to depth do 22 + Buffer.add_string buf "</div>" 23 + done; 24 + Buffer.add_string buf "</body></html>"; 25 + Buffer.contents buf 26 + 27 + (* Generate HTML with many sibling elements *) 28 + let generate_wide_html count = 29 + let buf = Buffer.create (count * 100) in 30 + Buffer.add_string buf "<!DOCTYPE html><html><head><title>Test</title></head><body><div>"; 31 + for i = 1 to count do 32 + Buffer.add_string buf (Printf.sprintf "<span id=\"s%d\" class=\"cls\" data-value=\"%d\">Item %d</span>" i i i) 33 + done; 34 + Buffer.add_string buf "</div></body></html>"; 35 + Buffer.contents buf 36 + 37 + (* Generate HTML with table *) 38 + let generate_table_html rows cols = 39 + let buf = Buffer.create (rows * cols * 50) in 40 + Buffer.add_string buf "<!DOCTYPE html><html><head><title>Table</title></head><body><table>"; 41 + Buffer.add_string buf "<thead><tr>"; 42 + for c = 1 to cols do 43 + Buffer.add_string buf (Printf.sprintf "<th>Col %d</th>" c) 44 + done; 45 + Buffer.add_string buf "</tr></thead><tbody>"; 46 + for r = 1 to rows do 47 + Buffer.add_string buf "<tr>"; 48 + for c = 1 to cols do 49 + Buffer.add_string buf (Printf.sprintf "<td>R%dC%d</td>" r c) 50 + done; 51 + Buffer.add_string buf "</tr>" 52 + done; 53 + Buffer.add_string buf "</tbody></table></body></html>"; 54 + Buffer.contents buf 55 + 56 + (* Generate HTML with interactive elements (tests nesting checker) *) 57 + let generate_interactive_html count = 58 + let buf = Buffer.create (count * 200) in 59 + Buffer.add_string buf "<!DOCTYPE html><html><head><title>Forms</title></head><body>"; 60 + for i = 1 to count do 61 + Buffer.add_string buf (Printf.sprintf 62 + "<form id=\"f%d\"><label for=\"i%d\">Label</label><input type=\"text\" id=\"i%d\" name=\"n%d\"><button type=\"submit\">Submit</button></form>" i i i i) 63 + done; 64 + Buffer.add_string buf "</body></html>"; 65 + Buffer.contents buf 66 + 67 + (* Generate HTML with validation errors *) 68 + let generate_invalid_html count = 69 + let buf = Buffer.create (count * 100) in 70 + Buffer.add_string buf "<!DOCTYPE html><html><head><title>Invalid</title></head><body>"; 71 + for i = 1 to count do 72 + (* Various validation errors *) 73 + Buffer.add_string buf (Printf.sprintf "<div role=\"invalid%d\">" i); 74 + Buffer.add_string buf "<img>"; (* missing alt *) 75 + Buffer.add_string buf "<a href=\"#\"><button>Nested interactive</button></a>"; 76 + Buffer.add_string buf "</div>" 77 + done; 78 + Buffer.add_string buf "</body></html>"; 79 + Buffer.contents buf 80 + 81 + (* Benchmark parsing only *) 82 + let bench_parse html iterations = 83 + let bytes = Bytes.of_string html in 84 + time_it "Parse only" iterations (fun () -> 85 + Html5rw.parse_bytes bytes 86 + ) 87 + 88 + (* Benchmark validation only (on pre-parsed DOM) *) 89 + let bench_validate doc iterations = 90 + time_it "Validate only" iterations (fun () -> 91 + Htmlrw_check.check_parsed doc 92 + ) 93 + 94 + (* Benchmark parse + validate *) 95 + let bench_parse_and_validate html iterations = 96 + time_it "Parse + Validate" iterations (fun () -> 97 + Htmlrw_check.check_string html 98 + ) 99 + 100 + (* Benchmark repeated message access (tests caching via check result) *) 101 + let bench_message_access result iterations = 102 + time_it "Message access (10x errors/warnings)" iterations (fun () -> 103 + for _ = 1 to 10 do 104 + ignore (Htmlrw_check.errors result); 105 + ignore (Htmlrw_check.warnings result); 106 + ignore (Htmlrw_check.messages result) 107 + done 108 + ) 109 + 110 + (* Micro-benchmarks for specific optimizations *) 111 + 112 + (* Test List.mem vs Hashtbl.mem for table element lookups *) 113 + let bench_list_mem iterations = 114 + let elements = ["table"; "tbody"; "tfoot"; "thead"; "tr"] in 115 + let test_names = ["table"; "div"; "tbody"; "span"; "tr"; "p"; "thead"; "a"] in 116 + time_it "List.mem (5 elements, 8 lookups)" iterations (fun () -> 117 + List.iter (fun name -> 118 + ignore (List.mem name elements) 119 + ) test_names 120 + ) 121 + 122 + let bench_hashtbl_mem iterations = 123 + let tbl = Hashtbl.create 8 in 124 + List.iter (fun e -> Hashtbl.add tbl e ()) ["table"; "tbody"; "tfoot"; "thead"; "tr"]; 125 + let test_names = ["table"; "div"; "tbody"; "span"; "tr"; "p"; "thead"; "a"] in 126 + time_it "Hashtbl.mem (5 elements, 8 lookups)" iterations (fun () -> 127 + List.iter (fun name -> 128 + ignore (Hashtbl.mem tbl name) 129 + ) test_names 130 + ) 131 + 132 + (* Test noncharacter validation - bitwise vs List.mem *) 133 + let bench_nonchar_list iterations = 134 + let nonchars = [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF; 135 + 0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF; 136 + 0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF; 137 + 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF; 138 + 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF; 139 + 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] in 140 + let test_codes = [0x41; 0xFFFE; 0x1000; 0x10FFFF; 0xFDD0; 0x3FFFE; 0x100; 0xFFFF] in 141 + time_it "List.mem nonchar (34 elements, 8 checks)" iterations (fun () -> 142 + List.iter (fun code -> 143 + ignore (List.mem code nonchars) 144 + ) test_codes 145 + ) 146 + 147 + let bench_nonchar_bitwise iterations = 148 + let test_codes = [0x41; 0xFFFE; 0x1000; 0x10FFFF; 0xFDD0; 0x3FFFE; 0x100; 0xFFFF] in 149 + time_it "Bitwise nonchar (8 checks)" iterations (fun () -> 150 + List.iter (fun code -> 151 + let low16 = code land 0xFFFF in 152 + ignore (low16 = 0xFFFE || low16 = 0xFFFF) 153 + ) test_codes 154 + ) 155 + 156 + (* Test message list reversal - once vs cached *) 157 + let bench_list_rev iterations = 158 + let msgs = List.init 100 (fun i -> Printf.sprintf "Message %d" i) in 159 + time_it "List.rev (100 msgs, 10 accesses)" iterations (fun () -> 160 + for _ = 1 to 10 do 161 + ignore (List.rev msgs) 162 + done 163 + ) 164 + 165 + let bench_cached_rev iterations = 166 + let msgs = List.init 100 (fun i -> Printf.sprintf "Message %d" i) in 167 + let cached = ref None in 168 + time_it "Cached rev (100 msgs, 10 accesses)" iterations (fun () -> 169 + cached := None; (* Reset cache each iteration *) 170 + for _ = 1 to 10 do 171 + match !cached with 172 + | Some c -> ignore c 173 + | None -> 174 + let rev = List.rev msgs in 175 + cached := Some rev; 176 + ignore rev 177 + done 178 + ) 179 + 180 + let () = 181 + Printf.printf "\n=== HTML5 Parser/Validator Benchmarks ===\n\n"; 182 + 183 + (* Micro-benchmarks first *) 184 + Printf.printf "--- Micro-benchmarks: Optimized Code Paths ---\n"; 185 + Printf.printf "\nTable element lookup (List.mem vs Hashtbl.mem):\n"; 186 + bench_list_mem 100000; 187 + bench_hashtbl_mem 100000; 188 + 189 + Printf.printf "\nNoncharacter validation (List.mem vs bitwise):\n"; 190 + bench_nonchar_list 100000; 191 + bench_nonchar_bitwise 100000; 192 + 193 + Printf.printf "\nMessage list reversal (repeated vs cached):\n"; 194 + bench_list_rev 10000; 195 + bench_cached_rev 10000; 196 + Printf.printf "\n"; 197 + 198 + (* Small documents *) 199 + Printf.printf "--- Small Document (100 nested divs) ---\n"; 200 + let small_nested = generate_nested_html 100 in 201 + Printf.printf "Document size: %d bytes\n" (String.length small_nested); 202 + bench_parse small_nested 1000; 203 + let doc = Html5rw.parse_bytes (Bytes.of_string small_nested) in 204 + bench_validate doc 1000; 205 + bench_parse_and_validate small_nested 1000; 206 + Printf.printf "\n"; 207 + 208 + (* Wide documents *) 209 + Printf.printf "--- Wide Document (1000 siblings) ---\n"; 210 + let wide = generate_wide_html 1000 in 211 + Printf.printf "Document size: %d bytes\n" (String.length wide); 212 + bench_parse wide 100; 213 + let doc = Html5rw.parse_bytes (Bytes.of_string wide) in 214 + bench_validate doc 100; 215 + bench_parse_and_validate wide 100; 216 + Printf.printf "\n"; 217 + 218 + (* Table documents *) 219 + Printf.printf "--- Table Document (100x20) ---\n"; 220 + let table = generate_table_html 100 20 in 221 + Printf.printf "Document size: %d bytes\n" (String.length table); 222 + bench_parse table 100; 223 + let doc = Html5rw.parse_bytes (Bytes.of_string table) in 224 + bench_validate doc 100; 225 + bench_parse_and_validate table 100; 226 + Printf.printf "\n"; 227 + 228 + (* Interactive elements (nesting checker stress test) *) 229 + Printf.printf "--- Interactive Elements (200 forms) ---\n"; 230 + let interactive = generate_interactive_html 200 in 231 + Printf.printf "Document size: %d bytes\n" (String.length interactive); 232 + bench_parse interactive 100; 233 + let doc = Html5rw.parse_bytes (Bytes.of_string interactive) in 234 + bench_validate doc 100; 235 + bench_parse_and_validate interactive 100; 236 + Printf.printf "\n"; 237 + 238 + (* Invalid HTML (validation error generation) *) 239 + Printf.printf "--- Invalid HTML (100 error patterns) ---\n"; 240 + let invalid = generate_invalid_html 100 in 241 + Printf.printf "Document size: %d bytes\n" (String.length invalid); 242 + bench_parse invalid 100; 243 + let doc = Html5rw.parse_bytes (Bytes.of_string invalid) in 244 + bench_validate doc 100; 245 + bench_parse_and_validate invalid 100; 246 + Printf.printf "\n"; 247 + 248 + (* Message access benchmark - tests caching *) 249 + Printf.printf "--- Message Access (invalid HTML result, 10 accesses) ---\n"; 250 + let result = Htmlrw_check.check_string invalid in 251 + bench_message_access result 1000; 252 + Printf.printf "\n"; 253 + 254 + (* Large document *) 255 + Printf.printf "--- Large Document (500 nested divs) ---\n"; 256 + let large_nested = generate_nested_html 500 in 257 + Printf.printf "Document size: %d bytes\n" (String.length large_nested); 258 + bench_parse large_nested 100; 259 + let doc = Html5rw.parse_bytes (Bytes.of_string large_nested) in 260 + bench_validate doc 100; 261 + bench_parse_and_validate large_nested 100; 262 + Printf.printf "\n"; 263 + 264 + Printf.printf "=== Benchmarks Complete ===\n"
+3
bench/dune
··· 1 + (executable 2 + (name bench) 3 + (libraries html5rw htmlrw_check unix))
+47
lib/html5rw/dom/dom.mli
··· 737 737 val has_attr : node -> string -> bool 738 738 (** [has_attr node name] returns [true] if the node has attribute [name]. *) 739 739 740 + (** {1 Space-Separated Attribute Values} 741 + 742 + Many HTML attributes contain space-separated lists of values. For example, 743 + the [class] attribute contains CSS class names: [class="header main active"]. 744 + These functions parse such attributes into OCaml lists. 745 + 746 + Per the HTML5 spec, "ASCII whitespace" (space, tab, newline, carriage return, 747 + form feed) is used as the separator. 748 + *) 749 + 750 + val split_on_whitespace : string -> string list 751 + (** [split_on_whitespace s] splits a string on ASCII whitespace. 752 + 753 + This implements the HTML5 "split on ASCII whitespace" algorithm. 754 + 755 + {b Example:} 756 + {[ 757 + split_on_whitespace "foo bar\tbaz" 758 + (* Returns: ["foo"; "bar"; "baz"] *) 759 + ]} 760 + *) 761 + 762 + val get_attr_list : node -> string -> string list 763 + (** [get_attr_list node name] returns a space-separated attribute as a list. 764 + 765 + Returns an empty list if the attribute doesn't exist. 766 + *) 767 + 768 + val get_class_list : node -> string list 769 + (** [get_class_list node] returns the class attribute as a list of class names. *) 770 + 771 + val get_rel_list : node -> string list 772 + (** [get_rel_list node] returns the rel attribute as a list of link types 773 + (lowercased since they are case-insensitive). *) 774 + 775 + val get_headers_list : node -> string list 776 + (** [get_headers_list node] returns the headers attribute as a list of IDs. *) 777 + 778 + val get_itemref_list : node -> string list 779 + (** [get_itemref_list node] returns the itemref attribute as a list of IDs. *) 780 + 781 + val get_itemprop_list : node -> string list 782 + (** [get_itemprop_list node] returns the itemprop attribute as a list. *) 783 + 784 + val get_itemtype_list : node -> string list 785 + (** [get_itemtype_list node] returns the itemtype attribute as a list of URLs. *) 786 + 740 787 (** {1 Location Helpers} *) 741 788 742 789 val make_location : line:int -> column:int -> ?end_line:int -> ?end_column:int ->
+33
lib/html5rw/dom/dom_node.ml
··· 147 147 148 148 let has_attr node name = List.mem_assoc name node.attrs 149 149 150 + (* Whitespace splitting for space-separated attribute values per HTML5 spec. 151 + Handles ASCII whitespace: space, tab, newline, carriage return, form feed *) 152 + let split_on_whitespace s = 153 + let is_whitespace = function 154 + | ' ' | '\t' | '\n' | '\r' | '\x0c' -> true 155 + | _ -> false 156 + in 157 + let len = String.length s in 158 + let rec find_start acc i = 159 + if i >= len then List.rev acc 160 + else if is_whitespace s.[i] then find_start acc (i + 1) 161 + else find_end acc i (i + 1) 162 + and find_end acc start i = 163 + if i >= len then List.rev (String.sub s start (i - start) :: acc) 164 + else if is_whitespace s.[i] then find_start (String.sub s start (i - start) :: acc) (i + 1) 165 + else find_end acc start (i + 1) 166 + in 167 + find_start [] 0 168 + 169 + (* Get space-separated attribute as list *) 170 + let get_attr_list node name = 171 + match get_attr node name with 172 + | Some s -> split_on_whitespace s 173 + | None -> [] 174 + 175 + (* Common space-separated attribute accessors *) 176 + let get_class_list node = get_attr_list node "class" 177 + let get_rel_list node = List.map String.lowercase_ascii (get_attr_list node "rel") 178 + let get_headers_list node = get_attr_list node "headers" 179 + let get_itemref_list node = get_attr_list node "itemref" 180 + let get_itemprop_list node = get_attr_list node "itemprop" 181 + let get_itemtype_list node = get_attr_list node "itemtype" 182 + 150 183 (* Tree traversal *) 151 184 let rec descendants node = 152 185 List.concat_map (fun n -> n :: descendants n) node.children
+86
lib/html5rw/dom/dom_node.mli
··· 740 740 val has_attr : node -> string -> bool 741 741 (** [has_attr node name] returns [true] if the node has attribute [name]. *) 742 742 743 + (** {1 Space-Separated Attribute Values} 744 + 745 + Many HTML attributes contain space-separated lists of values. For example, 746 + the [class] attribute contains CSS class names: [class="header main active"]. 747 + These functions parse such attributes into OCaml lists. 748 + 749 + Per the HTML5 spec, "ASCII whitespace" (space, tab, newline, carriage return, 750 + form feed) is used as the separator. 751 + *) 752 + 753 + val split_on_whitespace : string -> string list 754 + (** [split_on_whitespace s] splits a string on ASCII whitespace. 755 + 756 + This implements the HTML5 "split on ASCII whitespace" algorithm used 757 + for parsing space-separated attribute values. 758 + 759 + {b Example:} 760 + {[ 761 + split_on_whitespace "foo bar\tbaz" 762 + (* Returns: ["foo"; "bar"; "baz"] *) 763 + ]} 764 + *) 765 + 766 + val get_attr_list : node -> string -> string list 767 + (** [get_attr_list node name] returns a space-separated attribute as a list. 768 + 769 + Returns an empty list if the attribute doesn't exist. 770 + 771 + {b Example:} 772 + {[ 773 + (* For <div class="foo bar baz"> *) 774 + get_attr_list div "class" 775 + (* Returns: ["foo"; "bar"; "baz"] *) 776 + ]} 777 + *) 778 + 779 + val get_class_list : node -> string list 780 + (** [get_class_list node] returns the class attribute as a list of class names. 781 + 782 + Equivalent to [get_attr_list node "class"]. 783 + 784 + {b Example:} 785 + {[ 786 + (* For <div class="container main"> *) 787 + get_class_list div 788 + (* Returns: ["container"; "main"] *) 789 + ]} 790 + *) 791 + 792 + val get_rel_list : node -> string list 793 + (** [get_rel_list node] returns the rel attribute as a list of link types. 794 + 795 + Link types are lowercased since they are case-insensitive per HTML5 spec. 796 + 797 + {b Example:} 798 + {[ 799 + (* For <link rel="stylesheet preload"> *) 800 + get_rel_list link 801 + (* Returns: ["stylesheet"; "preload"] *) 802 + ]} 803 + *) 804 + 805 + val get_headers_list : node -> string list 806 + (** [get_headers_list node] returns the headers attribute as a list of IDs. 807 + 808 + Used on [<td>] and [<th>] elements to associate cells with headers. 809 + *) 810 + 811 + val get_itemref_list : node -> string list 812 + (** [get_itemref_list node] returns the itemref attribute as a list of IDs. 813 + 814 + Used for microdata to reference elements by ID. 815 + *) 816 + 817 + val get_itemprop_list : node -> string list 818 + (** [get_itemprop_list node] returns the itemprop attribute as a list. 819 + 820 + Used for microdata property names. 821 + *) 822 + 823 + val get_itemtype_list : node -> string list 824 + (** [get_itemtype_list node] returns the itemtype attribute as a list of URLs. 825 + 826 + Used for microdata type URLs. 827 + *) 828 + 743 829 (** {1 Location Helpers} 744 830 745 831 Functions to manage source location information for nodes.
+10 -6
lib/html5rw/dom/dom_serialize.ml
··· 8 8 open Bytesrw 9 9 open Dom_node 10 10 11 - (* Void elements that don't have end tags *) 12 - let void_elements = [ 13 - "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input"; 14 - "link"; "meta"; "source"; "track"; "wbr" 15 - ] 11 + (* Void elements that don't have end tags - O(1) hashtable lookup *) 12 + let void_elements_tbl = 13 + let elements = [ 14 + "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input"; 15 + "link"; "meta"; "source"; "track"; "wbr" 16 + ] in 17 + let tbl = Hashtbl.create (List.length elements) in 18 + List.iter (fun e -> Hashtbl.add tbl e ()) elements; 19 + tbl 16 20 17 - let is_void name = List.mem name void_elements 21 + let is_void name = Hashtbl.mem void_elements_tbl name 18 22 19 23 (* Foreign attribute adjustments for test output *) 20 24 let foreign_attr_adjustments = [
+32 -8
lib/html5rw/parser/parser_constants.ml
··· 80 80 let mathml_text_integration = ["mi"; "mo"; "mn"; "ms"; "mtext"] 81 81 let mathml_text_integration_tbl = make_set mathml_text_integration 82 82 83 - (* MathML attribute adjustments *) 84 - let mathml_attr_adjustments = [ 85 - ("definitionurl", "definitionURL") 86 - ] 83 + (* MathML attribute adjustments - O(1) hashtable lookup *) 84 + let mathml_attr_adjustments_tbl = 85 + let adjustments = [("definitionurl", "definitionURL")] in 86 + let tbl = Hashtbl.create 4 in 87 + List.iter (fun (k, v) -> Hashtbl.add tbl k v) adjustments; 88 + tbl 87 89 88 90 let adjust_mathml_attrs attrs = 89 91 List.map (fun (k, v) -> 90 - match List.assoc_opt (lowercase k) mathml_attr_adjustments with 92 + match Hashtbl.find_opt mathml_attr_adjustments_tbl (lowercase k) with 91 93 | Some adjusted_k -> (adjusted_k, v) 92 94 | None -> (k, v) 93 95 ) attrs ··· 95 97 (* SVG HTML integration points *) 96 98 let svg_html_integration = ["foreignObject"; "desc"; "title"] 97 99 let svg_html_integration_tbl = make_set (List.map lowercase svg_html_integration) 100 + 101 + (* Helper to create hashtable from association list for O(1) lookup *) 102 + let make_assoc_tbl pairs = 103 + let tbl = Hashtbl.create (List.length pairs) in 104 + List.iter (fun (k, v) -> Hashtbl.add tbl k v) pairs; 105 + tbl 98 106 99 107 (* SVG tag name adjustments *) 100 108 let svg_tag_adjustments = [ ··· 136 144 ("radialgradient", "radialGradient"); 137 145 ("textpath", "textPath"); 138 146 ] 147 + let svg_tag_adjustments_tbl = make_assoc_tbl svg_tag_adjustments 139 148 140 149 (* SVG attribute adjustments *) 141 150 let svg_attr_adjustments = [ ··· 198 207 ("ychannelselector", "yChannelSelector"); 199 208 ("zoomandpan", "zoomAndPan"); 200 209 ] 210 + let svg_attr_adjustments_tbl = make_assoc_tbl svg_attr_adjustments 201 211 202 212 (* Foreign attribute adjustments *) 203 213 let foreign_attr_adjustments = [ ··· 213 223 ("xmlns", ("", "xmlns", "http://www.w3.org/2000/xmlns/")); 214 224 ("xmlns:xlink", ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/")); 215 225 ] 226 + let foreign_attr_adjustments_tbl = make_assoc_tbl foreign_attr_adjustments 216 227 217 228 (* Quirks mode detection *) 218 229 let quirky_public_matches = [ ··· 293 304 "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" 294 305 ] 295 306 307 + (* Table-related element sets for tree builder O(1) lookups *) 308 + let table_section_elements = ["tbody"; "thead"; "tfoot"] 309 + let table_section_elements_tbl = make_set table_section_elements 310 + 311 + let table_cell_elements = ["td"; "th"] 312 + let table_cell_elements_tbl = make_set table_cell_elements 313 + 314 + let foster_parenting_elements = ["table"; "tbody"; "tfoot"; "thead"; "tr"] 315 + let foster_parenting_elements_tbl = make_set foster_parenting_elements 316 + 296 317 (* Helper functions - O(1) hashtable lookups *) 297 318 let is_void_element name = Hashtbl.mem void_elements_tbl name 298 319 let is_formatting_element name = Hashtbl.mem formatting_elements_tbl name ··· 303 324 let is_mathml_text_integration name = Hashtbl.mem mathml_text_integration_tbl name 304 325 let is_svg_html_integration name = Hashtbl.mem svg_html_integration_tbl (lowercase name) 305 326 let is_select_scope_exclude name = Hashtbl.mem select_scope_exclude_tbl name 327 + let is_table_section_element name = Hashtbl.mem table_section_elements_tbl name 328 + let is_table_cell_element name = Hashtbl.mem table_cell_elements_tbl name 329 + let is_foster_parenting_element name = Hashtbl.mem foster_parenting_elements_tbl name 306 330 307 331 (* Backwards compatibility aliases *) 308 332 let is_void = List.mem ··· 311 335 let is_heading = List.mem 312 336 313 337 let adjust_svg_tag_name name = 314 - match List.assoc_opt (lowercase name) svg_tag_adjustments with 338 + match Hashtbl.find_opt svg_tag_adjustments_tbl (lowercase name) with 315 339 | Some adjusted -> adjusted 316 340 | None -> name 317 341 318 342 let adjust_svg_attrs attrs = 319 343 List.map (fun (name, value) -> 320 344 let adjusted_name = 321 - match List.assoc_opt (lowercase name) svg_attr_adjustments with 345 + match Hashtbl.find_opt svg_attr_adjustments_tbl (lowercase name) with 322 346 | Some n -> n 323 347 | None -> name 324 348 in ··· 327 351 328 352 let adjust_foreign_attrs attrs = 329 353 List.map (fun (name, value) -> 330 - match List.assoc_opt (lowercase name) foreign_attr_adjustments with 354 + match Hashtbl.find_opt foreign_attr_adjustments_tbl (lowercase name) with 331 355 | Some (prefix, local, _ns) -> 332 356 if prefix = "" then (local, value) 333 357 else (prefix ^ ":" ^ local, value)
+11 -11
lib/html5rw/parser/parser_tree_builder.ml
··· 91 91 (* Set initial mode based on context *) 92 92 t.mode <- ( 93 93 if name = "html" then Parser_insertion_mode.Before_head 94 - else if List.mem name ["tbody"; "thead"; "tfoot"] && (ns = None || ns = Some "html") then 94 + else if Parser_constants.is_table_section_element name && (ns = None || ns = Some "html") then 95 95 Parser_insertion_mode.In_table_body 96 96 else if name = "tr" && (ns = None || ns = Some "html") then 97 97 Parser_insertion_mode.In_row 98 - else if List.mem name ["td"; "th"] && (ns = None || ns = Some "html") then 98 + else if Parser_constants.is_table_cell_element name && (ns = None || ns = Some "html") then 99 99 Parser_insertion_mode.In_cell 100 100 else if name = "caption" && (ns = None || ns = Some "html") then 101 101 Parser_insertion_mode.In_caption ··· 160 160 match current_node t with 161 161 | None -> (t.document, None) 162 162 | Some target -> 163 - if t.foster_parenting && List.mem target.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin 163 + if t.foster_parenting && Parser_constants.is_foster_parenting_element target.Dom.name then begin 164 164 (* Foster parenting per WHATWG spec *) 165 165 (* Step 1: Find last (most recent) template and table in stack *) 166 166 (* Note: index 0 = top of stack = most recently added *) ··· 599 599 | Some p -> Dom.remove_child p !last_node 600 600 | None -> ()); 601 601 (* Check if we need foster parenting *) 602 - if t.foster_parenting && List.mem ca.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin 602 + if t.foster_parenting && Parser_constants.is_foster_parenting_element ca.Dom.name then begin 603 603 (* Find table and insert before it *) 604 604 let rec find_table = function 605 605 | [] -> None ··· 698 698 end; 699 699 if t.mode <> Parser_insertion_mode.In_select_in_table then 700 700 t.mode <- Parser_insertion_mode.In_select 701 - end else if List.mem name ["td"; "th"] && not is_last then 701 + end else if Parser_constants.is_table_cell_element name && not is_last then 702 702 t.mode <- Parser_insertion_mode.In_cell 703 703 else if name = "tr" then 704 704 t.mode <- Parser_insertion_mode.In_row 705 - else if List.mem name ["tbody"; "thead"; "tfoot"] then 705 + else if Parser_constants.is_table_section_element name then 706 706 t.mode <- Parser_insertion_mode.In_table_body 707 707 else if name = "caption" then 708 708 t.mode <- Parser_insertion_mode.In_caption ··· 1473 1473 1474 1474 and process_in_table t token = 1475 1475 match token with 1476 - | Token.Character _ when (match current_node t with Some n -> List.mem n.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] | None -> false) -> 1476 + | Token.Character _ when (match current_node t with Some n -> Parser_constants.is_foster_parenting_element n.Dom.name | None -> false) -> 1477 1477 t.pending_table_chars <- []; 1478 1478 t.original_mode <- Some t.mode; 1479 1479 t.mode <- Parser_insertion_mode.In_table_text; ··· 1798 1798 1799 1799 and process_in_cell t token = 1800 1800 match token with 1801 - | Token.Tag { kind = Token.End; name; _ } when List.mem name ["td"; "th"] -> 1801 + | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_table_cell_element name -> 1802 1802 if not (has_element_in_table_scope t name) then 1803 1803 parse_error t "unexpected-end-tag" 1804 1804 else begin ··· 1822 1822 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] -> 1823 1823 parse_error t "unexpected-end-tag" 1824 1824 | Token.Tag { kind = Token.End; name; _ } 1825 - when List.mem name ["table"; "tbody"; "tfoot"; "thead"; "tr"] -> 1825 + when Parser_constants.is_foster_parenting_element name -> 1826 1826 if not (has_element_in_table_scope t name) then 1827 1827 parse_error t "unexpected-end-tag" 1828 1828 else begin ··· 1835 1835 and close_cell t = 1836 1836 generate_implied_end_tags t (); 1837 1837 (match current_node t with 1838 - | Some n when not (List.mem n.Dom.name ["td"; "th"] && is_in_html_namespace n) -> parse_error t "end-tag-too-early" 1838 + | Some n when not (Parser_constants.is_table_cell_element n.Dom.name && is_in_html_namespace n) -> parse_error t "end-tag-too-early" 1839 1839 | _ -> ()); 1840 1840 pop_until_html_one_of t ["td"; "th"]; 1841 1841 clear_active_formatting_to_marker t; ··· 2050 2050 t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes; 2051 2051 t.mode <- Parser_insertion_mode.In_table_body; 2052 2052 process_token t token 2053 - | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"] -> 2053 + | Token.Tag { kind = Token.Start; name; _ } when Parser_constants.is_table_cell_element name -> 2054 2054 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2055 2055 t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes; 2056 2056 t.mode <- Parser_insertion_mode.In_row;
+3 -6
lib/html5rw/tokenizer/tokenizer_impl.ml
··· 1943 1943 error t (Printf.sprintf "surrogate-character-reference:%04x" code); 1944 1944 replacement_char 1945 1945 end else if (code >= 0xFDD0 && code <= 0xFDEF) || 1946 - List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF; 1947 - 0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF; 1948 - 0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF; 1949 - 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF; 1950 - 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF; 1951 - 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin 1946 + (* Noncharacters end in 0xFFFE or 0xFFFF in each plane (0-16). 1947 + O(1) bitwise check instead of O(n) list membership. *) 1948 + (let low16 = code land 0xFFFF in low16 = 0xFFFE || low16 = 0xFFFF) then begin 1952 1949 error t (Printf.sprintf "noncharacter-character-reference:%05x" code); 1953 1950 Entities.Numeric_ref.codepoint_to_utf8 code 1954 1951 end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
+3 -21
lib/htmlrw_check/datatype/datatype.ml
··· 42 42 else String.sub s start (end_pos - start + 1) 43 43 44 44 (** Split string on HTML whitespace characters (space, tab, LF, FF, CR). 45 - Filters out empty tokens. Used for space-separated attribute values. *) 46 - let split_on_whitespace s = 47 - let len = String.length s in 48 - let rec split acc start i = 49 - if i >= len then 50 - if i > start then 51 - List.rev ((String.sub s start (i - start)) :: acc) 52 - else 53 - List.rev acc 54 - else if is_whitespace s.[i] then 55 - let acc' = 56 - if i > start then 57 - (String.sub s start (i - start)) :: acc 58 - else 59 - acc 60 - in 61 - split acc' (i + 1) (i + 1) 62 - else 63 - split acc start (i + 1) 64 - in 65 - split [] 0 0 45 + Filters out empty tokens. Used for space-separated attribute values. 46 + Delegates to the core library implementation. *) 47 + let split_on_whitespace = Html5rw.Dom.split_on_whitespace 66 48 67 49 (** Factory for creating enum-based validators. 68 50 Many HTML attributes accept a fixed set of keyword values.
+2 -2
lib/htmlrw_check/element/attr.ml
··· 872 872 let get_rel attrs = 873 873 List.find_map (function `Rel s -> Some s | _ -> None) attrs 874 874 875 - (** Get rel attribute as list of link types (space-separated) *) 875 + (** Get rel attribute as list of link types (space-separated, lowercased per HTML5 spec) *) 876 876 let get_rel_list attrs = 877 877 match get_rel attrs with 878 - | Some s -> Datatype.split_on_whitespace s 878 + | Some s -> List.map String.lowercase_ascii (Datatype.split_on_whitespace s) 879 879 | None -> [] 880 880 881 881 (** Get headers attribute as raw string *)
+9
lib/htmlrw_check/element/element.ml
··· 104 104 let get_all_aria elem = Attr.get_all_aria elem.attrs 105 105 let get_all_data elem = Attr.get_all_data elem.attrs 106 106 107 + (** Space-separated list getters *) 108 + let get_class_list elem = Attr.get_class_list elem.attrs 109 + let get_rel_list elem = Attr.get_rel_list elem.attrs 110 + let get_headers_list elem = Attr.get_headers_list elem.attrs 111 + let get_itemref_list elem = Attr.get_itemref_list elem.attrs 112 + let get_itemprop_list elem = Attr.get_itemprop_list elem.attrs 113 + let get_itemtype_list elem = Attr.get_itemtype_list elem.attrs 114 + let get_aria_list name elem = Attr.get_aria_list name elem.attrs 115 + 107 116 (** {1 Category Checks} *) 108 117 109 118 (** Check if this is a void element *)
+26
lib/htmlrw_check/element/element.mli
··· 162 162 val get_all_data : t -> (string * string) list 163 163 (** [get_all_data elem] extracts all data-* attributes. *) 164 164 165 + (** {1 Space-Separated List Accessors} 166 + 167 + These functions return attribute values as parsed lists, splitting on 168 + whitespace per HTML5 spec. *) 169 + 170 + val get_class_list : t -> string list 171 + (** [get_class_list elem] returns class names as a list. *) 172 + 173 + val get_rel_list : t -> string list 174 + (** [get_rel_list elem] returns link types as a list. *) 175 + 176 + val get_headers_list : t -> string list 177 + (** [get_headers_list elem] returns header IDs as a list (for td/th). *) 178 + 179 + val get_itemref_list : t -> string list 180 + (** [get_itemref_list elem] returns itemref IDs as a list. *) 181 + 182 + val get_itemprop_list : t -> string list 183 + (** [get_itemprop_list elem] returns itemprop names as a list. *) 184 + 185 + val get_itemtype_list : t -> string list 186 + (** [get_itemtype_list elem] returns itemtype URLs as a list. *) 187 + 188 + val get_aria_list : string -> t -> string list 189 + (** [get_aria_list name elem] returns space-separated ARIA values as a list. *) 190 + 165 191 (** {1 Raw Attribute Fallback} *) 166 192 167 193 val get_raw_attr : string -> t -> string option
+15 -4
lib/htmlrw_check/message_collector.ml
··· 3 3 type t = { 4 4 mutable messages : Message.t list; 5 5 mutable current_location : Message.location option; 6 + mutable cached_reversed : Message.t list option; (* Cache for O(1) repeated access *) 6 7 } 7 8 8 - let create () = { messages = []; current_location = None } 9 + let create () = { messages = []; current_location = None; cached_reversed = None } 9 10 10 11 let set_current_location t location = t.current_location <- location 11 12 let clear_current_location t = t.current_location <- None 12 13 let get_current_location t = t.current_location 13 14 14 - let add t msg = t.messages <- msg :: t.messages 15 + let add t msg = 16 + t.messages <- msg :: t.messages; 17 + t.cached_reversed <- None (* Invalidate cache *) 15 18 16 19 (** Add a message from a typed conformance error code *) 17 20 let add_typed t ?location ?element ?attribute ?extract error_code = ··· 23 26 let msg = Message.of_conformance_error ?location:loc ?element ?attribute ?extract error_code in 24 27 add t msg 25 28 26 - let messages t = List.rev t.messages 29 + let messages t = 30 + match t.cached_reversed with 31 + | Some cached -> cached 32 + | None -> 33 + let reversed = List.rev t.messages in 34 + t.cached_reversed <- Some reversed; 35 + reversed 27 36 28 37 let errors t = 29 38 List.filter (fun msg -> msg.Message.severity = Message.Error) (messages t) ··· 45 54 if msg.Message.severity = Message.Error then acc + 1 else acc) 46 55 0 t.messages 47 56 48 - let clear t = t.messages <- [] 57 + let clear t = 58 + t.messages <- []; 59 + t.cached_reversed <- None
+4 -8
lib/htmlrw_check/semantic/id_checker.ml
··· 55 55 56 56 (** Attributes that reference a single ID - O(1) lookup. *) 57 57 let single_id_ref_attrs = 58 - let tbl = Hashtbl.create 8 in 59 - List.iter (fun a -> Hashtbl.add tbl a ()) [ 58 + Attr_utils.hashtbl_of_list [ 60 59 "for"; (* label *) 61 60 "form"; (* form-associated elements *) 62 61 "list"; (* input *) ··· 64 63 "popovertarget"; (* button - references popover element *) 65 64 "commandfor"; (* button - references element to control *) 66 65 "anchor"; (* popover - references anchor element *) 67 - ]; 68 - tbl 66 + ] 69 67 70 68 let is_single_id_ref_attr name = Hashtbl.mem single_id_ref_attrs name 71 69 72 70 (** Attributes that reference multiple IDs (space-separated) - O(1) lookup. *) 73 71 let multi_id_ref_attrs = 74 - let tbl = Hashtbl.create 8 in 75 - List.iter (fun a -> Hashtbl.add tbl a ()) [ 72 + Attr_utils.hashtbl_of_list [ 76 73 "headers"; (* td, th *) 77 74 "aria-labelledby"; 78 75 "aria-describedby"; ··· 80 77 "aria-flowto"; 81 78 "aria-owns"; 82 79 "itemref"; 83 - ]; 84 - tbl 80 + ] 85 81 86 82 let is_multi_id_ref_attr name = Hashtbl.mem multi_id_ref_attrs name 87 83
+3 -7
lib/htmlrw_check/semantic/lang_detecting_checker.ml
··· 18 18 19 19 (* Elements whose text content we skip for language detection - O(1) lookup *) 20 20 let skip_elements = 21 - let tbl = Hashtbl.create 20 in 22 - List.iter (fun e -> Hashtbl.add tbl e ()) [ 21 + Attr_utils.hashtbl_of_list [ 23 22 "a"; "button"; "details"; "figcaption"; "form"; "li"; "nav"; 24 23 "pre"; "script"; "select"; "span"; "style"; "summary"; 25 24 "td"; "textarea"; "th"; "tr" 26 - ]; 27 - tbl 25 + ] 28 26 29 27 let is_skip_element name = Hashtbl.mem skip_elements name 30 28 31 29 (* RTL languages - O(1) lookup *) 32 30 let rtl_langs = 33 - let tbl = Hashtbl.create 16 in 34 - List.iter (fun l -> Hashtbl.add tbl l ()) ["ar"; "azb"; "ckb"; "dv"; "fa"; "he"; "pnb"; "ps"; "sd"; "ug"; "ur"; "iw"]; 35 - tbl 31 + Attr_utils.hashtbl_of_list ["ar"; "azb"; "ckb"; "dv"; "fa"; "he"; "pnb"; "ps"; "sd"; "ug"; "ur"; "iw"] 36 32 37 33 let is_rtl_lang lang = Hashtbl.mem rtl_langs lang 38 34
+6 -3
lib/htmlrw_check/semantic/nesting_checker.ml
··· 155 155 let map_num = special_ancestor_number "map" in 156 156 1 lsl map_num 157 157 158 - (** Transparent elements - inherit content model from parent *) 159 - let transparent_elements = ["a"; "canvas"; "video"; "audio"; "object"; "ins"; "del"; "map"] 158 + (** Transparent elements - inherit content model from parent. O(1) hashtable lookup. *) 159 + let transparent_elements_tbl = 160 + Attr_utils.hashtbl_of_list ["a"; "canvas"; "video"; "audio"; "object"; "ins"; "del"; "map"] 161 + 162 + let is_transparent_element name = Hashtbl.mem transparent_elements_tbl name 160 163 161 164 (** Stack node representing an element's context. *) 162 165 type stack_node = { ··· 334 337 in 335 338 336 339 (* Push onto stack *) 337 - let is_transparent = List.mem name transparent_elements in 340 + let is_transparent = is_transparent_element name in 338 341 let node = { ancestor_mask = state.ancestor_mask; name; is_transparent } in 339 342 state.stack <- node :: state.stack; 340 343 state.ancestor_mask <- new_mask
+14 -19
lib/htmlrw_check/specialized/aria_checker.ml
··· 9 9 specification. Abstract roles are included but should not be used 10 10 in HTML content. *) 11 11 let valid_aria_roles = 12 - let roles = [ 12 + Attr_utils.hashtbl_of_list [ 13 13 (* Document structure roles *) 14 14 (* Note: "directory" is deprecated in WAI-ARIA 1.2, use "list" instead *) 15 15 "article"; "associationlist"; "associationlistitemkey"; ··· 43 43 44 44 (* Additional roles *) 45 45 "application"; "columnheader"; "rowheader"; 46 - ] in 47 - let tbl = Hashtbl.create (List.length roles) in 48 - List.iter (fun role -> Hashtbl.add tbl role ()) roles; 49 - tbl 46 + ] 50 47 51 48 (** Roles that cannot have accessible names. 52 49 53 50 These roles must not have aria-label or aria-labelledby attributes. *) 54 51 let roles_which_cannot_be_named = 55 - let roles = [ 52 + Attr_utils.hashtbl_of_list [ 56 53 "caption"; "code"; "deletion"; "emphasis"; "generic"; "insertion"; 57 54 "mark"; "none"; "paragraph"; "presentation"; "strong"; "subscript"; 58 55 "suggestion"; "superscript" 59 - ] in 60 - let tbl = Hashtbl.create (List.length roles) in 61 - List.iter (fun role -> Hashtbl.add tbl role ()) roles; 62 - tbl 56 + ] 63 57 64 58 (** Elements whose implicit role is 'generic' and cannot have aria-label unless 65 - they have an explicit role that allows naming. *) 66 - let elements_with_generic_role = [ 67 - "a"; "abbr"; "address"; "b"; "bdi"; "bdo"; "br"; "caption"; "cite"; "code"; 68 - "colgroup"; "data"; "del"; "dfn"; "div"; "em"; "figcaption"; "hgroup"; "i"; 69 - "ins"; "kbd"; "legend"; "mark"; "p"; "pre"; "q"; "rp"; "rt"; "ruby"; "s"; 70 - "samp"; "small"; "span"; "strong"; "sub"; "sup"; "time"; "title"; "u"; "var"; 71 - "wbr" 72 - ] 59 + they have an explicit role that allows naming. O(1) lookup. *) 60 + let elements_with_generic_role = 61 + Attr_utils.hashtbl_of_list [ 62 + "a"; "abbr"; "address"; "b"; "bdi"; "bdo"; "br"; "caption"; "cite"; "code"; 63 + "colgroup"; "data"; "del"; "dfn"; "div"; "em"; "figcaption"; "hgroup"; "i"; 64 + "ins"; "kbd"; "legend"; "mark"; "p"; "pre"; "q"; "rp"; "rt"; "ruby"; "s"; 65 + "samp"; "small"; "span"; "strong"; "sub"; "sup"; "time"; "title"; "u"; "var"; 66 + "wbr" 67 + ] 73 68 74 69 (** Check if element name is a custom element (contains hyphen). *) 75 70 let is_custom_element name = ··· 90 85 if is_custom_element element_name then false 91 86 else 92 87 (* No implicit role - element has generic role unless it's interactive *) 93 - not (List.mem element_name elements_with_generic_role) 88 + not (Hashtbl.mem elements_with_generic_role element_name) 94 89 95 90 (** Map from descendant role to set of required ancestor roles. *) 96 91 let required_role_ancestor_by_descendant : (string, string list) Hashtbl.t =
+2 -9
lib/htmlrw_check/specialized/attr_restrictions_checker.ml
··· 135 135 if name_lower = "link" then begin 136 136 let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in 137 137 let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in 138 - let rel_value = Attr_utils.get_attr "rel" attrs in 139 138 let as_value = Attr_utils.get_attr "as" attrs in 140 139 141 140 (* imagesizes requires imagesrcset *) ··· 155 154 (* as attribute requires rel="preload" or rel="modulepreload" *) 156 155 (match as_value with 157 156 | Some _ -> 158 - let rel_is_preload = match rel_value with 159 - | Some v -> 160 - let rel_lower = String.lowercase_ascii (String.trim v) in 161 - String.length rel_lower > 0 && 162 - (List.mem "preload" (String.split_on_char ' ' rel_lower) || 163 - List.mem "modulepreload" (String.split_on_char ' ' rel_lower)) 164 - | None -> false 165 - in 157 + let rel_types = Element.get_rel_list element in 158 + let rel_is_preload = List.mem "preload" rel_types || List.mem "modulepreload" rel_types in 166 159 if not rel_is_preload then 167 160 Message_collector.add_typed collector (`Link `As_requires_preload) 168 161 | None -> ())
+1 -3
lib/htmlrw_check/specialized/label_checker.ml
··· 4 4 5 5 (** Labelable elements that label can reference - O(1) hashtable lookup *) 6 6 let labelable_elements = 7 - let tbl = Hashtbl.create 8 in 8 - List.iter (fun e -> Hashtbl.add tbl e ()) ["button"; "input"; "meter"; "output"; "progress"; "select"; "textarea"]; 9 - tbl 7 + Attr_utils.hashtbl_of_list ["button"; "input"; "meter"; "output"; "progress"; "select"; "textarea"] 10 8 11 9 let is_labelable name = Hashtbl.mem labelable_elements name 12 10