+264
bench/bench.ml
+264
bench/bench.ml
···
···
1
+
(* Simple benchmarks for HTML5 parsing and validation *)
2
+
3
+
let time_it name iterations f =
4
+
Gc.full_major ();
5
+
let start = Unix.gettimeofday () in
6
+
for _ = 1 to iterations do
7
+
ignore (f ())
8
+
done;
9
+
let elapsed = Unix.gettimeofday () -. start in
10
+
let per_iter = elapsed /. float_of_int iterations *. 1000.0 in
11
+
Printf.printf "%-40s %d iters in %.3fs (%.3f ms/iter)\n%!" name iterations elapsed per_iter
12
+
13
+
(* Generate HTML with nested elements *)
14
+
let generate_nested_html depth =
15
+
let buf = Buffer.create 4096 in
16
+
Buffer.add_string buf "<!DOCTYPE html><html><head><title>Test</title></head><body>";
17
+
for i = 1 to depth do
18
+
Buffer.add_string buf (Printf.sprintf "<div id=\"d%d\" class=\"c1 c2 c3\">" i)
19
+
done;
20
+
Buffer.add_string buf "<p>Content</p>";
21
+
for _ = 1 to depth do
22
+
Buffer.add_string buf "</div>"
23
+
done;
24
+
Buffer.add_string buf "</body></html>";
25
+
Buffer.contents buf
26
+
27
+
(* Generate HTML with many sibling elements *)
28
+
let generate_wide_html count =
29
+
let buf = Buffer.create (count * 100) in
30
+
Buffer.add_string buf "<!DOCTYPE html><html><head><title>Test</title></head><body><div>";
31
+
for i = 1 to count do
32
+
Buffer.add_string buf (Printf.sprintf "<span id=\"s%d\" class=\"cls\" data-value=\"%d\">Item %d</span>" i i i)
33
+
done;
34
+
Buffer.add_string buf "</div></body></html>";
35
+
Buffer.contents buf
36
+
37
+
(* Generate HTML with table *)
38
+
let generate_table_html rows cols =
39
+
let buf = Buffer.create (rows * cols * 50) in
40
+
Buffer.add_string buf "<!DOCTYPE html><html><head><title>Table</title></head><body><table>";
41
+
Buffer.add_string buf "<thead><tr>";
42
+
for c = 1 to cols do
43
+
Buffer.add_string buf (Printf.sprintf "<th>Col %d</th>" c)
44
+
done;
45
+
Buffer.add_string buf "</tr></thead><tbody>";
46
+
for r = 1 to rows do
47
+
Buffer.add_string buf "<tr>";
48
+
for c = 1 to cols do
49
+
Buffer.add_string buf (Printf.sprintf "<td>R%dC%d</td>" r c)
50
+
done;
51
+
Buffer.add_string buf "</tr>"
52
+
done;
53
+
Buffer.add_string buf "</tbody></table></body></html>";
54
+
Buffer.contents buf
55
+
56
+
(* Generate HTML with interactive elements (tests nesting checker) *)
57
+
let generate_interactive_html count =
58
+
let buf = Buffer.create (count * 200) in
59
+
Buffer.add_string buf "<!DOCTYPE html><html><head><title>Forms</title></head><body>";
60
+
for i = 1 to count do
61
+
Buffer.add_string buf (Printf.sprintf
62
+
"<form id=\"f%d\"><label for=\"i%d\">Label</label><input type=\"text\" id=\"i%d\" name=\"n%d\"><button type=\"submit\">Submit</button></form>" i i i i)
63
+
done;
64
+
Buffer.add_string buf "</body></html>";
65
+
Buffer.contents buf
66
+
67
+
(* Generate HTML with validation errors *)
68
+
let generate_invalid_html count =
69
+
let buf = Buffer.create (count * 100) in
70
+
Buffer.add_string buf "<!DOCTYPE html><html><head><title>Invalid</title></head><body>";
71
+
for i = 1 to count do
72
+
(* Various validation errors *)
73
+
Buffer.add_string buf (Printf.sprintf "<div role=\"invalid%d\">" i);
74
+
Buffer.add_string buf "<img>"; (* missing alt *)
75
+
Buffer.add_string buf "<a href=\"#\"><button>Nested interactive</button></a>";
76
+
Buffer.add_string buf "</div>"
77
+
done;
78
+
Buffer.add_string buf "</body></html>";
79
+
Buffer.contents buf
80
+
81
+
(* Benchmark parsing only *)
82
+
let bench_parse html iterations =
83
+
let bytes = Bytes.of_string html in
84
+
time_it "Parse only" iterations (fun () ->
85
+
Html5rw.parse_bytes bytes
86
+
)
87
+
88
+
(* Benchmark validation only (on pre-parsed DOM) *)
89
+
let bench_validate doc iterations =
90
+
time_it "Validate only" iterations (fun () ->
91
+
Htmlrw_check.check_parsed doc
92
+
)
93
+
94
+
(* Benchmark parse + validate *)
95
+
let bench_parse_and_validate html iterations =
96
+
time_it "Parse + Validate" iterations (fun () ->
97
+
Htmlrw_check.check_string html
98
+
)
99
+
100
+
(* Benchmark repeated message access (tests caching via check result) *)
101
+
let bench_message_access result iterations =
102
+
time_it "Message access (10x errors/warnings)" iterations (fun () ->
103
+
for _ = 1 to 10 do
104
+
ignore (Htmlrw_check.errors result);
105
+
ignore (Htmlrw_check.warnings result);
106
+
ignore (Htmlrw_check.messages result)
107
+
done
108
+
)
109
+
110
+
(* Micro-benchmarks for specific optimizations *)
111
+
112
+
(* Test List.mem vs Hashtbl.mem for table element lookups *)
113
+
let bench_list_mem iterations =
114
+
let elements = ["table"; "tbody"; "tfoot"; "thead"; "tr"] in
115
+
let test_names = ["table"; "div"; "tbody"; "span"; "tr"; "p"; "thead"; "a"] in
116
+
time_it "List.mem (5 elements, 8 lookups)" iterations (fun () ->
117
+
List.iter (fun name ->
118
+
ignore (List.mem name elements)
119
+
) test_names
120
+
)
121
+
122
+
let bench_hashtbl_mem iterations =
123
+
let tbl = Hashtbl.create 8 in
124
+
List.iter (fun e -> Hashtbl.add tbl e ()) ["table"; "tbody"; "tfoot"; "thead"; "tr"];
125
+
let test_names = ["table"; "div"; "tbody"; "span"; "tr"; "p"; "thead"; "a"] in
126
+
time_it "Hashtbl.mem (5 elements, 8 lookups)" iterations (fun () ->
127
+
List.iter (fun name ->
128
+
ignore (Hashtbl.mem tbl name)
129
+
) test_names
130
+
)
131
+
132
+
(* Test noncharacter validation - bitwise vs List.mem *)
133
+
let bench_nonchar_list iterations =
134
+
let nonchars = [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
135
+
0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF;
136
+
0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF;
137
+
0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
138
+
0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
139
+
0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] in
140
+
let test_codes = [0x41; 0xFFFE; 0x1000; 0x10FFFF; 0xFDD0; 0x3FFFE; 0x100; 0xFFFF] in
141
+
time_it "List.mem nonchar (34 elements, 8 checks)" iterations (fun () ->
142
+
List.iter (fun code ->
143
+
ignore (List.mem code nonchars)
144
+
) test_codes
145
+
)
146
+
147
+
let bench_nonchar_bitwise iterations =
148
+
let test_codes = [0x41; 0xFFFE; 0x1000; 0x10FFFF; 0xFDD0; 0x3FFFE; 0x100; 0xFFFF] in
149
+
time_it "Bitwise nonchar (8 checks)" iterations (fun () ->
150
+
List.iter (fun code ->
151
+
let low16 = code land 0xFFFF in
152
+
ignore (low16 = 0xFFFE || low16 = 0xFFFF)
153
+
) test_codes
154
+
)
155
+
156
+
(* Test message list reversal - once vs cached *)
157
+
let bench_list_rev iterations =
158
+
let msgs = List.init 100 (fun i -> Printf.sprintf "Message %d" i) in
159
+
time_it "List.rev (100 msgs, 10 accesses)" iterations (fun () ->
160
+
for _ = 1 to 10 do
161
+
ignore (List.rev msgs)
162
+
done
163
+
)
164
+
165
+
let bench_cached_rev iterations =
166
+
let msgs = List.init 100 (fun i -> Printf.sprintf "Message %d" i) in
167
+
let cached = ref None in
168
+
time_it "Cached rev (100 msgs, 10 accesses)" iterations (fun () ->
169
+
cached := None; (* Reset cache each iteration *)
170
+
for _ = 1 to 10 do
171
+
match !cached with
172
+
| Some c -> ignore c
173
+
| None ->
174
+
let rev = List.rev msgs in
175
+
cached := Some rev;
176
+
ignore rev
177
+
done
178
+
)
179
+
180
+
let () =
181
+
Printf.printf "\n=== HTML5 Parser/Validator Benchmarks ===\n\n";
182
+
183
+
(* Micro-benchmarks first *)
184
+
Printf.printf "--- Micro-benchmarks: Optimized Code Paths ---\n";
185
+
Printf.printf "\nTable element lookup (List.mem vs Hashtbl.mem):\n";
186
+
bench_list_mem 100000;
187
+
bench_hashtbl_mem 100000;
188
+
189
+
Printf.printf "\nNoncharacter validation (List.mem vs bitwise):\n";
190
+
bench_nonchar_list 100000;
191
+
bench_nonchar_bitwise 100000;
192
+
193
+
Printf.printf "\nMessage list reversal (repeated vs cached):\n";
194
+
bench_list_rev 10000;
195
+
bench_cached_rev 10000;
196
+
Printf.printf "\n";
197
+
198
+
(* Small documents *)
199
+
Printf.printf "--- Small Document (100 nested divs) ---\n";
200
+
let small_nested = generate_nested_html 100 in
201
+
Printf.printf "Document size: %d bytes\n" (String.length small_nested);
202
+
bench_parse small_nested 1000;
203
+
let doc = Html5rw.parse_bytes (Bytes.of_string small_nested) in
204
+
bench_validate doc 1000;
205
+
bench_parse_and_validate small_nested 1000;
206
+
Printf.printf "\n";
207
+
208
+
(* Wide documents *)
209
+
Printf.printf "--- Wide Document (1000 siblings) ---\n";
210
+
let wide = generate_wide_html 1000 in
211
+
Printf.printf "Document size: %d bytes\n" (String.length wide);
212
+
bench_parse wide 100;
213
+
let doc = Html5rw.parse_bytes (Bytes.of_string wide) in
214
+
bench_validate doc 100;
215
+
bench_parse_and_validate wide 100;
216
+
Printf.printf "\n";
217
+
218
+
(* Table documents *)
219
+
Printf.printf "--- Table Document (100x20) ---\n";
220
+
let table = generate_table_html 100 20 in
221
+
Printf.printf "Document size: %d bytes\n" (String.length table);
222
+
bench_parse table 100;
223
+
let doc = Html5rw.parse_bytes (Bytes.of_string table) in
224
+
bench_validate doc 100;
225
+
bench_parse_and_validate table 100;
226
+
Printf.printf "\n";
227
+
228
+
(* Interactive elements (nesting checker stress test) *)
229
+
Printf.printf "--- Interactive Elements (200 forms) ---\n";
230
+
let interactive = generate_interactive_html 200 in
231
+
Printf.printf "Document size: %d bytes\n" (String.length interactive);
232
+
bench_parse interactive 100;
233
+
let doc = Html5rw.parse_bytes (Bytes.of_string interactive) in
234
+
bench_validate doc 100;
235
+
bench_parse_and_validate interactive 100;
236
+
Printf.printf "\n";
237
+
238
+
(* Invalid HTML (validation error generation) *)
239
+
Printf.printf "--- Invalid HTML (100 error patterns) ---\n";
240
+
let invalid = generate_invalid_html 100 in
241
+
Printf.printf "Document size: %d bytes\n" (String.length invalid);
242
+
bench_parse invalid 100;
243
+
let doc = Html5rw.parse_bytes (Bytes.of_string invalid) in
244
+
bench_validate doc 100;
245
+
bench_parse_and_validate invalid 100;
246
+
Printf.printf "\n";
247
+
248
+
(* Message access benchmark - tests caching *)
249
+
Printf.printf "--- Message Access (invalid HTML result, 10 accesses) ---\n";
250
+
let result = Htmlrw_check.check_string invalid in
251
+
bench_message_access result 1000;
252
+
Printf.printf "\n";
253
+
254
+
(* Large document *)
255
+
Printf.printf "--- Large Document (500 nested divs) ---\n";
256
+
let large_nested = generate_nested_html 500 in
257
+
Printf.printf "Document size: %d bytes\n" (String.length large_nested);
258
+
bench_parse large_nested 100;
259
+
let doc = Html5rw.parse_bytes (Bytes.of_string large_nested) in
260
+
bench_validate doc 100;
261
+
bench_parse_and_validate large_nested 100;
262
+
Printf.printf "\n";
263
+
264
+
Printf.printf "=== Benchmarks Complete ===\n"
+47
lib/html5rw/dom/dom.mli
+47
lib/html5rw/dom/dom.mli
···
737
val has_attr : node -> string -> bool
738
(** [has_attr node name] returns [true] if the node has attribute [name]. *)
739
740
+
(** {1 Space-Separated Attribute Values}
741
+
742
+
Many HTML attributes contain space-separated lists of values. For example,
743
+
the [class] attribute contains CSS class names: [class="header main active"].
744
+
These functions parse such attributes into OCaml lists.
745
+
746
+
Per the HTML5 spec, "ASCII whitespace" (space, tab, newline, carriage return,
747
+
form feed) is used as the separator.
748
+
*)
749
+
750
+
val split_on_whitespace : string -> string list
751
+
(** [split_on_whitespace s] splits a string on ASCII whitespace.
752
+
753
+
This implements the HTML5 "split on ASCII whitespace" algorithm.
754
+
755
+
{b Example:}
756
+
{[
757
+
split_on_whitespace "foo bar\tbaz"
758
+
(* Returns: ["foo"; "bar"; "baz"] *)
759
+
]}
760
+
*)
761
+
762
+
val get_attr_list : node -> string -> string list
763
+
(** [get_attr_list node name] returns a space-separated attribute as a list.
764
+
765
+
Returns an empty list if the attribute doesn't exist.
766
+
*)
767
+
768
+
val get_class_list : node -> string list
769
+
(** [get_class_list node] returns the class attribute as a list of class names. *)
770
+
771
+
val get_rel_list : node -> string list
772
+
(** [get_rel_list node] returns the rel attribute as a list of link types
773
+
(lowercased since they are case-insensitive). *)
774
+
775
+
val get_headers_list : node -> string list
776
+
(** [get_headers_list node] returns the headers attribute as a list of IDs. *)
777
+
778
+
val get_itemref_list : node -> string list
779
+
(** [get_itemref_list node] returns the itemref attribute as a list of IDs. *)
780
+
781
+
val get_itemprop_list : node -> string list
782
+
(** [get_itemprop_list node] returns the itemprop attribute as a list. *)
783
+
784
+
val get_itemtype_list : node -> string list
785
+
(** [get_itemtype_list node] returns the itemtype attribute as a list of URLs. *)
786
+
787
(** {1 Location Helpers} *)
788
789
val make_location : line:int -> column:int -> ?end_line:int -> ?end_column:int ->
+33
lib/html5rw/dom/dom_node.ml
+33
lib/html5rw/dom/dom_node.ml
···
147
148
let has_attr node name = List.mem_assoc name node.attrs
149
150
+
(* Whitespace splitting for space-separated attribute values per HTML5 spec.
151
+
Handles ASCII whitespace: space, tab, newline, carriage return, form feed *)
152
+
let split_on_whitespace s =
153
+
let is_whitespace = function
154
+
| ' ' | '\t' | '\n' | '\r' | '\x0c' -> true
155
+
| _ -> false
156
+
in
157
+
let len = String.length s in
158
+
let rec find_start acc i =
159
+
if i >= len then List.rev acc
160
+
else if is_whitespace s.[i] then find_start acc (i + 1)
161
+
else find_end acc i (i + 1)
162
+
and find_end acc start i =
163
+
if i >= len then List.rev (String.sub s start (i - start) :: acc)
164
+
else if is_whitespace s.[i] then find_start (String.sub s start (i - start) :: acc) (i + 1)
165
+
else find_end acc start (i + 1)
166
+
in
167
+
find_start [] 0
168
+
169
+
(* Get space-separated attribute as list *)
170
+
let get_attr_list node name =
171
+
match get_attr node name with
172
+
| Some s -> split_on_whitespace s
173
+
| None -> []
174
+
175
+
(* Common space-separated attribute accessors *)
176
+
let get_class_list node = get_attr_list node "class"
177
+
let get_rel_list node = List.map String.lowercase_ascii (get_attr_list node "rel")
178
+
let get_headers_list node = get_attr_list node "headers"
179
+
let get_itemref_list node = get_attr_list node "itemref"
180
+
let get_itemprop_list node = get_attr_list node "itemprop"
181
+
let get_itemtype_list node = get_attr_list node "itemtype"
182
+
183
(* Tree traversal *)
184
let rec descendants node =
185
List.concat_map (fun n -> n :: descendants n) node.children
+86
lib/html5rw/dom/dom_node.mli
+86
lib/html5rw/dom/dom_node.mli
···
740
val has_attr : node -> string -> bool
741
(** [has_attr node name] returns [true] if the node has attribute [name]. *)
742
743
+
(** {1 Space-Separated Attribute Values}
744
+
745
+
Many HTML attributes contain space-separated lists of values. For example,
746
+
the [class] attribute contains CSS class names: [class="header main active"].
747
+
These functions parse such attributes into OCaml lists.
748
+
749
+
Per the HTML5 spec, "ASCII whitespace" (space, tab, newline, carriage return,
750
+
form feed) is used as the separator.
751
+
*)
752
+
753
+
val split_on_whitespace : string -> string list
754
+
(** [split_on_whitespace s] splits a string on ASCII whitespace.
755
+
756
+
This implements the HTML5 "split on ASCII whitespace" algorithm used
757
+
for parsing space-separated attribute values.
758
+
759
+
{b Example:}
760
+
{[
761
+
split_on_whitespace "foo bar\tbaz"
762
+
(* Returns: ["foo"; "bar"; "baz"] *)
763
+
]}
764
+
*)
765
+
766
+
val get_attr_list : node -> string -> string list
767
+
(** [get_attr_list node name] returns a space-separated attribute as a list.
768
+
769
+
Returns an empty list if the attribute doesn't exist.
770
+
771
+
{b Example:}
772
+
{[
773
+
(* For <div class="foo bar baz"> *)
774
+
get_attr_list div "class"
775
+
(* Returns: ["foo"; "bar"; "baz"] *)
776
+
]}
777
+
*)
778
+
779
+
val get_class_list : node -> string list
780
+
(** [get_class_list node] returns the class attribute as a list of class names.
781
+
782
+
Equivalent to [get_attr_list node "class"].
783
+
784
+
{b Example:}
785
+
{[
786
+
(* For <div class="container main"> *)
787
+
get_class_list div
788
+
(* Returns: ["container"; "main"] *)
789
+
]}
790
+
*)
791
+
792
+
val get_rel_list : node -> string list
793
+
(** [get_rel_list node] returns the rel attribute as a list of link types.
794
+
795
+
Link types are lowercased since they are case-insensitive per HTML5 spec.
796
+
797
+
{b Example:}
798
+
{[
799
+
(* For <link rel="stylesheet preload"> *)
800
+
get_rel_list link
801
+
(* Returns: ["stylesheet"; "preload"] *)
802
+
]}
803
+
*)
804
+
805
+
val get_headers_list : node -> string list
806
+
(** [get_headers_list node] returns the headers attribute as a list of IDs.
807
+
808
+
Used on [<td>] and [<th>] elements to associate cells with headers.
809
+
*)
810
+
811
+
val get_itemref_list : node -> string list
812
+
(** [get_itemref_list node] returns the itemref attribute as a list of IDs.
813
+
814
+
Used for microdata to reference elements by ID.
815
+
*)
816
+
817
+
val get_itemprop_list : node -> string list
818
+
(** [get_itemprop_list node] returns the itemprop attribute as a list.
819
+
820
+
Used for microdata property names.
821
+
*)
822
+
823
+
val get_itemtype_list : node -> string list
824
+
(** [get_itemtype_list node] returns the itemtype attribute as a list of URLs.
825
+
826
+
Used for microdata type URLs.
827
+
*)
828
+
829
(** {1 Location Helpers}
830
831
Functions to manage source location information for nodes.
+10
-6
lib/html5rw/dom/dom_serialize.ml
+10
-6
lib/html5rw/dom/dom_serialize.ml
···
8
open Bytesrw
9
open Dom_node
10
11
-
(* Void elements that don't have end tags *)
12
-
let void_elements = [
13
-
"area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
14
-
"link"; "meta"; "source"; "track"; "wbr"
15
-
]
16
17
-
let is_void name = List.mem name void_elements
18
19
(* Foreign attribute adjustments for test output *)
20
let foreign_attr_adjustments = [
···
8
open Bytesrw
9
open Dom_node
10
11
+
(* Void elements that don't have end tags - O(1) hashtable lookup *)
12
+
let void_elements_tbl =
13
+
let elements = [
14
+
"area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
15
+
"link"; "meta"; "source"; "track"; "wbr"
16
+
] in
17
+
let tbl = Hashtbl.create (List.length elements) in
18
+
List.iter (fun e -> Hashtbl.add tbl e ()) elements;
19
+
tbl
20
21
+
let is_void name = Hashtbl.mem void_elements_tbl name
22
23
(* Foreign attribute adjustments for test output *)
24
let foreign_attr_adjustments = [
+32
-8
lib/html5rw/parser/parser_constants.ml
+32
-8
lib/html5rw/parser/parser_constants.ml
···
80
let mathml_text_integration = ["mi"; "mo"; "mn"; "ms"; "mtext"]
81
let mathml_text_integration_tbl = make_set mathml_text_integration
82
83
-
(* MathML attribute adjustments *)
84
-
let mathml_attr_adjustments = [
85
-
("definitionurl", "definitionURL")
86
-
]
87
88
let adjust_mathml_attrs attrs =
89
List.map (fun (k, v) ->
90
-
match List.assoc_opt (lowercase k) mathml_attr_adjustments with
91
| Some adjusted_k -> (adjusted_k, v)
92
| None -> (k, v)
93
) attrs
···
95
(* SVG HTML integration points *)
96
let svg_html_integration = ["foreignObject"; "desc"; "title"]
97
let svg_html_integration_tbl = make_set (List.map lowercase svg_html_integration)
98
99
(* SVG tag name adjustments *)
100
let svg_tag_adjustments = [
···
136
("radialgradient", "radialGradient");
137
("textpath", "textPath");
138
]
139
140
(* SVG attribute adjustments *)
141
let svg_attr_adjustments = [
···
198
("ychannelselector", "yChannelSelector");
199
("zoomandpan", "zoomAndPan");
200
]
201
202
(* Foreign attribute adjustments *)
203
let foreign_attr_adjustments = [
···
213
("xmlns", ("", "xmlns", "http://www.w3.org/2000/xmlns/"));
214
("xmlns:xlink", ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/"));
215
]
216
217
(* Quirks mode detection *)
218
let quirky_public_matches = [
···
293
"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
294
]
295
296
(* Helper functions - O(1) hashtable lookups *)
297
let is_void_element name = Hashtbl.mem void_elements_tbl name
298
let is_formatting_element name = Hashtbl.mem formatting_elements_tbl name
···
303
let is_mathml_text_integration name = Hashtbl.mem mathml_text_integration_tbl name
304
let is_svg_html_integration name = Hashtbl.mem svg_html_integration_tbl (lowercase name)
305
let is_select_scope_exclude name = Hashtbl.mem select_scope_exclude_tbl name
306
307
(* Backwards compatibility aliases *)
308
let is_void = List.mem
···
311
let is_heading = List.mem
312
313
let adjust_svg_tag_name name =
314
-
match List.assoc_opt (lowercase name) svg_tag_adjustments with
315
| Some adjusted -> adjusted
316
| None -> name
317
318
let adjust_svg_attrs attrs =
319
List.map (fun (name, value) ->
320
let adjusted_name =
321
-
match List.assoc_opt (lowercase name) svg_attr_adjustments with
322
| Some n -> n
323
| None -> name
324
in
···
327
328
let adjust_foreign_attrs attrs =
329
List.map (fun (name, value) ->
330
-
match List.assoc_opt (lowercase name) foreign_attr_adjustments with
331
| Some (prefix, local, _ns) ->
332
if prefix = "" then (local, value)
333
else (prefix ^ ":" ^ local, value)
···
80
let mathml_text_integration = ["mi"; "mo"; "mn"; "ms"; "mtext"]
81
let mathml_text_integration_tbl = make_set mathml_text_integration
82
83
+
(* MathML attribute adjustments - O(1) hashtable lookup *)
84
+
let mathml_attr_adjustments_tbl =
85
+
let adjustments = [("definitionurl", "definitionURL")] in
86
+
let tbl = Hashtbl.create 4 in
87
+
List.iter (fun (k, v) -> Hashtbl.add tbl k v) adjustments;
88
+
tbl
89
90
let adjust_mathml_attrs attrs =
91
List.map (fun (k, v) ->
92
+
match Hashtbl.find_opt mathml_attr_adjustments_tbl (lowercase k) with
93
| Some adjusted_k -> (adjusted_k, v)
94
| None -> (k, v)
95
) attrs
···
97
(* SVG HTML integration points *)
98
let svg_html_integration = ["foreignObject"; "desc"; "title"]
99
let svg_html_integration_tbl = make_set (List.map lowercase svg_html_integration)
100
+
101
+
(* Helper to create hashtable from association list for O(1) lookup *)
102
+
let make_assoc_tbl pairs =
103
+
let tbl = Hashtbl.create (List.length pairs) in
104
+
List.iter (fun (k, v) -> Hashtbl.add tbl k v) pairs;
105
+
tbl
106
107
(* SVG tag name adjustments *)
108
let svg_tag_adjustments = [
···
144
("radialgradient", "radialGradient");
145
("textpath", "textPath");
146
]
147
+
let svg_tag_adjustments_tbl = make_assoc_tbl svg_tag_adjustments
148
149
(* SVG attribute adjustments *)
150
let svg_attr_adjustments = [
···
207
("ychannelselector", "yChannelSelector");
208
("zoomandpan", "zoomAndPan");
209
]
210
+
let svg_attr_adjustments_tbl = make_assoc_tbl svg_attr_adjustments
211
212
(* Foreign attribute adjustments *)
213
let foreign_attr_adjustments = [
···
223
("xmlns", ("", "xmlns", "http://www.w3.org/2000/xmlns/"));
224
("xmlns:xlink", ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/"));
225
]
226
+
let foreign_attr_adjustments_tbl = make_assoc_tbl foreign_attr_adjustments
227
228
(* Quirks mode detection *)
229
let quirky_public_matches = [
···
304
"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
305
]
306
307
+
(* Table-related element sets for tree builder O(1) lookups *)
308
+
let table_section_elements = ["tbody"; "thead"; "tfoot"]
309
+
let table_section_elements_tbl = make_set table_section_elements
310
+
311
+
let table_cell_elements = ["td"; "th"]
312
+
let table_cell_elements_tbl = make_set table_cell_elements
313
+
314
+
let foster_parenting_elements = ["table"; "tbody"; "tfoot"; "thead"; "tr"]
315
+
let foster_parenting_elements_tbl = make_set foster_parenting_elements
316
+
317
(* Helper functions - O(1) hashtable lookups *)
318
let is_void_element name = Hashtbl.mem void_elements_tbl name
319
let is_formatting_element name = Hashtbl.mem formatting_elements_tbl name
···
324
let is_mathml_text_integration name = Hashtbl.mem mathml_text_integration_tbl name
325
let is_svg_html_integration name = Hashtbl.mem svg_html_integration_tbl (lowercase name)
326
let is_select_scope_exclude name = Hashtbl.mem select_scope_exclude_tbl name
327
+
let is_table_section_element name = Hashtbl.mem table_section_elements_tbl name
328
+
let is_table_cell_element name = Hashtbl.mem table_cell_elements_tbl name
329
+
let is_foster_parenting_element name = Hashtbl.mem foster_parenting_elements_tbl name
330
331
(* Backwards compatibility aliases *)
332
let is_void = List.mem
···
335
let is_heading = List.mem
336
337
let adjust_svg_tag_name name =
338
+
match Hashtbl.find_opt svg_tag_adjustments_tbl (lowercase name) with
339
| Some adjusted -> adjusted
340
| None -> name
341
342
let adjust_svg_attrs attrs =
343
List.map (fun (name, value) ->
344
let adjusted_name =
345
+
match Hashtbl.find_opt svg_attr_adjustments_tbl (lowercase name) with
346
| Some n -> n
347
| None -> name
348
in
···
351
352
let adjust_foreign_attrs attrs =
353
List.map (fun (name, value) ->
354
+
match Hashtbl.find_opt foreign_attr_adjustments_tbl (lowercase name) with
355
| Some (prefix, local, _ns) ->
356
if prefix = "" then (local, value)
357
else (prefix ^ ":" ^ local, value)
+11
-11
lib/html5rw/parser/parser_tree_builder.ml
+11
-11
lib/html5rw/parser/parser_tree_builder.ml
···
91
(* Set initial mode based on context *)
92
t.mode <- (
93
if name = "html" then Parser_insertion_mode.Before_head
94
-
else if List.mem name ["tbody"; "thead"; "tfoot"] && (ns = None || ns = Some "html") then
95
Parser_insertion_mode.In_table_body
96
else if name = "tr" && (ns = None || ns = Some "html") then
97
Parser_insertion_mode.In_row
98
-
else if List.mem name ["td"; "th"] && (ns = None || ns = Some "html") then
99
Parser_insertion_mode.In_cell
100
else if name = "caption" && (ns = None || ns = Some "html") then
101
Parser_insertion_mode.In_caption
···
160
match current_node t with
161
| None -> (t.document, None)
162
| Some target ->
163
-
if t.foster_parenting && List.mem target.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin
164
(* Foster parenting per WHATWG spec *)
165
(* Step 1: Find last (most recent) template and table in stack *)
166
(* Note: index 0 = top of stack = most recently added *)
···
599
| Some p -> Dom.remove_child p !last_node
600
| None -> ());
601
(* Check if we need foster parenting *)
602
-
if t.foster_parenting && List.mem ca.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin
603
(* Find table and insert before it *)
604
let rec find_table = function
605
| [] -> None
···
698
end;
699
if t.mode <> Parser_insertion_mode.In_select_in_table then
700
t.mode <- Parser_insertion_mode.In_select
701
-
end else if List.mem name ["td"; "th"] && not is_last then
702
t.mode <- Parser_insertion_mode.In_cell
703
else if name = "tr" then
704
t.mode <- Parser_insertion_mode.In_row
705
-
else if List.mem name ["tbody"; "thead"; "tfoot"] then
706
t.mode <- Parser_insertion_mode.In_table_body
707
else if name = "caption" then
708
t.mode <- Parser_insertion_mode.In_caption
···
1473
1474
and process_in_table t token =
1475
match token with
1476
-
| Token.Character _ when (match current_node t with Some n -> List.mem n.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] | None -> false) ->
1477
t.pending_table_chars <- [];
1478
t.original_mode <- Some t.mode;
1479
t.mode <- Parser_insertion_mode.In_table_text;
···
1798
1799
and process_in_cell t token =
1800
match token with
1801
-
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["td"; "th"] ->
1802
if not (has_element_in_table_scope t name) then
1803
parse_error t "unexpected-end-tag"
1804
else begin
···
1822
when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] ->
1823
parse_error t "unexpected-end-tag"
1824
| Token.Tag { kind = Token.End; name; _ }
1825
-
when List.mem name ["table"; "tbody"; "tfoot"; "thead"; "tr"] ->
1826
if not (has_element_in_table_scope t name) then
1827
parse_error t "unexpected-end-tag"
1828
else begin
···
1835
and close_cell t =
1836
generate_implied_end_tags t ();
1837
(match current_node t with
1838
-
| Some n when not (List.mem n.Dom.name ["td"; "th"] && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
1839
| _ -> ());
1840
pop_until_html_one_of t ["td"; "th"];
1841
clear_active_formatting_to_marker t;
···
2050
t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes;
2051
t.mode <- Parser_insertion_mode.In_table_body;
2052
process_token t token
2053
-
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"] ->
2054
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2055
t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes;
2056
t.mode <- Parser_insertion_mode.In_row;
···
91
(* Set initial mode based on context *)
92
t.mode <- (
93
if name = "html" then Parser_insertion_mode.Before_head
94
+
else if Parser_constants.is_table_section_element name && (ns = None || ns = Some "html") then
95
Parser_insertion_mode.In_table_body
96
else if name = "tr" && (ns = None || ns = Some "html") then
97
Parser_insertion_mode.In_row
98
+
else if Parser_constants.is_table_cell_element name && (ns = None || ns = Some "html") then
99
Parser_insertion_mode.In_cell
100
else if name = "caption" && (ns = None || ns = Some "html") then
101
Parser_insertion_mode.In_caption
···
160
match current_node t with
161
| None -> (t.document, None)
162
| Some target ->
163
+
if t.foster_parenting && Parser_constants.is_foster_parenting_element target.Dom.name then begin
164
(* Foster parenting per WHATWG spec *)
165
(* Step 1: Find last (most recent) template and table in stack *)
166
(* Note: index 0 = top of stack = most recently added *)
···
599
| Some p -> Dom.remove_child p !last_node
600
| None -> ());
601
(* Check if we need foster parenting *)
602
+
if t.foster_parenting && Parser_constants.is_foster_parenting_element ca.Dom.name then begin
603
(* Find table and insert before it *)
604
let rec find_table = function
605
| [] -> None
···
698
end;
699
if t.mode <> Parser_insertion_mode.In_select_in_table then
700
t.mode <- Parser_insertion_mode.In_select
701
+
end else if Parser_constants.is_table_cell_element name && not is_last then
702
t.mode <- Parser_insertion_mode.In_cell
703
else if name = "tr" then
704
t.mode <- Parser_insertion_mode.In_row
705
+
else if Parser_constants.is_table_section_element name then
706
t.mode <- Parser_insertion_mode.In_table_body
707
else if name = "caption" then
708
t.mode <- Parser_insertion_mode.In_caption
···
1473
1474
and process_in_table t token =
1475
match token with
1476
+
| Token.Character _ when (match current_node t with Some n -> Parser_constants.is_foster_parenting_element n.Dom.name | None -> false) ->
1477
t.pending_table_chars <- [];
1478
t.original_mode <- Some t.mode;
1479
t.mode <- Parser_insertion_mode.In_table_text;
···
1798
1799
and process_in_cell t token =
1800
match token with
1801
+
| Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_table_cell_element name ->
1802
if not (has_element_in_table_scope t name) then
1803
parse_error t "unexpected-end-tag"
1804
else begin
···
1822
when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] ->
1823
parse_error t "unexpected-end-tag"
1824
| Token.Tag { kind = Token.End; name; _ }
1825
+
when Parser_constants.is_foster_parenting_element name ->
1826
if not (has_element_in_table_scope t name) then
1827
parse_error t "unexpected-end-tag"
1828
else begin
···
1835
and close_cell t =
1836
generate_implied_end_tags t ();
1837
(match current_node t with
1838
+
| Some n when not (Parser_constants.is_table_cell_element n.Dom.name && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
1839
| _ -> ());
1840
pop_until_html_one_of t ["td"; "th"];
1841
clear_active_formatting_to_marker t;
···
2050
t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes;
2051
t.mode <- Parser_insertion_mode.In_table_body;
2052
process_token t token
2053
+
| Token.Tag { kind = Token.Start; name; _ } when Parser_constants.is_table_cell_element name ->
2054
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2055
t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes;
2056
t.mode <- Parser_insertion_mode.In_row;
+3
-6
lib/html5rw/tokenizer/tokenizer_impl.ml
+3
-6
lib/html5rw/tokenizer/tokenizer_impl.ml
···
1943
error t (Printf.sprintf "surrogate-character-reference:%04x" code);
1944
replacement_char
1945
end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
1946
-
List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
1947
-
0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF;
1948
-
0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF;
1949
-
0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
1950
-
0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
1951
-
0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin
1952
error t (Printf.sprintf "noncharacter-character-reference:%05x" code);
1953
Entities.Numeric_ref.codepoint_to_utf8 code
1954
end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
···
1943
error t (Printf.sprintf "surrogate-character-reference:%04x" code);
1944
replacement_char
1945
end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
1946
+
(* Noncharacters end in 0xFFFE or 0xFFFF in each plane (0-16).
1947
+
O(1) bitwise check instead of O(n) list membership. *)
1948
+
(let low16 = code land 0xFFFF in low16 = 0xFFFE || low16 = 0xFFFF) then begin
1949
error t (Printf.sprintf "noncharacter-character-reference:%05x" code);
1950
Entities.Numeric_ref.codepoint_to_utf8 code
1951
end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
+3
-21
lib/htmlrw_check/datatype/datatype.ml
+3
-21
lib/htmlrw_check/datatype/datatype.ml
···
42
else String.sub s start (end_pos - start + 1)
43
44
(** Split string on HTML whitespace characters (space, tab, LF, FF, CR).
45
-
Filters out empty tokens. Used for space-separated attribute values. *)
46
-
let split_on_whitespace s =
47
-
let len = String.length s in
48
-
let rec split acc start i =
49
-
if i >= len then
50
-
if i > start then
51
-
List.rev ((String.sub s start (i - start)) :: acc)
52
-
else
53
-
List.rev acc
54
-
else if is_whitespace s.[i] then
55
-
let acc' =
56
-
if i > start then
57
-
(String.sub s start (i - start)) :: acc
58
-
else
59
-
acc
60
-
in
61
-
split acc' (i + 1) (i + 1)
62
-
else
63
-
split acc start (i + 1)
64
-
in
65
-
split [] 0 0
66
67
(** Factory for creating enum-based validators.
68
Many HTML attributes accept a fixed set of keyword values.
···
42
else String.sub s start (end_pos - start + 1)
43
44
(** Split string on HTML whitespace characters (space, tab, LF, FF, CR).
45
+
Filters out empty tokens. Used for space-separated attribute values.
46
+
Delegates to the core library implementation. *)
47
+
let split_on_whitespace = Html5rw.Dom.split_on_whitespace
48
49
(** Factory for creating enum-based validators.
50
Many HTML attributes accept a fixed set of keyword values.
+2
-2
lib/htmlrw_check/element/attr.ml
+2
-2
lib/htmlrw_check/element/attr.ml
···
872
let get_rel attrs =
873
List.find_map (function `Rel s -> Some s | _ -> None) attrs
874
875
-
(** Get rel attribute as list of link types (space-separated) *)
876
let get_rel_list attrs =
877
match get_rel attrs with
878
-
| Some s -> Datatype.split_on_whitespace s
879
| None -> []
880
881
(** Get headers attribute as raw string *)
···
872
let get_rel attrs =
873
List.find_map (function `Rel s -> Some s | _ -> None) attrs
874
875
+
(** Get rel attribute as list of link types (space-separated, lowercased per HTML5 spec) *)
876
let get_rel_list attrs =
877
match get_rel attrs with
878
+
| Some s -> List.map String.lowercase_ascii (Datatype.split_on_whitespace s)
879
| None -> []
880
881
(** Get headers attribute as raw string *)
+9
lib/htmlrw_check/element/element.ml
+9
lib/htmlrw_check/element/element.ml
···
104
let get_all_aria elem = Attr.get_all_aria elem.attrs
105
let get_all_data elem = Attr.get_all_data elem.attrs
106
107
+
(** Space-separated list getters *)
108
+
let get_class_list elem = Attr.get_class_list elem.attrs
109
+
let get_rel_list elem = Attr.get_rel_list elem.attrs
110
+
let get_headers_list elem = Attr.get_headers_list elem.attrs
111
+
let get_itemref_list elem = Attr.get_itemref_list elem.attrs
112
+
let get_itemprop_list elem = Attr.get_itemprop_list elem.attrs
113
+
let get_itemtype_list elem = Attr.get_itemtype_list elem.attrs
114
+
let get_aria_list name elem = Attr.get_aria_list name elem.attrs
115
+
116
(** {1 Category Checks} *)
117
118
(** Check if this is a void element *)
+26
lib/htmlrw_check/element/element.mli
+26
lib/htmlrw_check/element/element.mli
···
162
val get_all_data : t -> (string * string) list
163
(** [get_all_data elem] extracts all data-* attributes. *)
164
165
+
(** {1 Space-Separated List Accessors}
166
+
167
+
These functions return attribute values as parsed lists, splitting on
168
+
whitespace per HTML5 spec. *)
169
+
170
+
val get_class_list : t -> string list
171
+
(** [get_class_list elem] returns class names as a list. *)
172
+
173
+
val get_rel_list : t -> string list
174
+
(** [get_rel_list elem] returns link types as a list. *)
175
+
176
+
val get_headers_list : t -> string list
177
+
(** [get_headers_list elem] returns header IDs as a list (for td/th). *)
178
+
179
+
val get_itemref_list : t -> string list
180
+
(** [get_itemref_list elem] returns itemref IDs as a list. *)
181
+
182
+
val get_itemprop_list : t -> string list
183
+
(** [get_itemprop_list elem] returns itemprop names as a list. *)
184
+
185
+
val get_itemtype_list : t -> string list
186
+
(** [get_itemtype_list elem] returns itemtype URLs as a list. *)
187
+
188
+
val get_aria_list : string -> t -> string list
189
+
(** [get_aria_list name elem] returns space-separated ARIA values as a list. *)
190
+
191
(** {1 Raw Attribute Fallback} *)
192
193
val get_raw_attr : string -> t -> string option
+15
-4
lib/htmlrw_check/message_collector.ml
+15
-4
lib/htmlrw_check/message_collector.ml
···
3
type t = {
4
mutable messages : Message.t list;
5
mutable current_location : Message.location option;
6
}
7
8
-
let create () = { messages = []; current_location = None }
9
10
let set_current_location t location = t.current_location <- location
11
let clear_current_location t = t.current_location <- None
12
let get_current_location t = t.current_location
13
14
-
let add t msg = t.messages <- msg :: t.messages
15
16
(** Add a message from a typed conformance error code *)
17
let add_typed t ?location ?element ?attribute ?extract error_code =
···
23
let msg = Message.of_conformance_error ?location:loc ?element ?attribute ?extract error_code in
24
add t msg
25
26
-
let messages t = List.rev t.messages
27
28
let errors t =
29
List.filter (fun msg -> msg.Message.severity = Message.Error) (messages t)
···
45
if msg.Message.severity = Message.Error then acc + 1 else acc)
46
0 t.messages
47
48
-
let clear t = t.messages <- []
···
3
type t = {
4
mutable messages : Message.t list;
5
mutable current_location : Message.location option;
6
+
mutable cached_reversed : Message.t list option; (* Cache for O(1) repeated access *)
7
}
8
9
+
let create () = { messages = []; current_location = None; cached_reversed = None }
10
11
let set_current_location t location = t.current_location <- location
12
let clear_current_location t = t.current_location <- None
13
let get_current_location t = t.current_location
14
15
+
let add t msg =
16
+
t.messages <- msg :: t.messages;
17
+
t.cached_reversed <- None (* Invalidate cache *)
18
19
(** Add a message from a typed conformance error code *)
20
let add_typed t ?location ?element ?attribute ?extract error_code =
···
26
let msg = Message.of_conformance_error ?location:loc ?element ?attribute ?extract error_code in
27
add t msg
28
29
+
let messages t =
30
+
match t.cached_reversed with
31
+
| Some cached -> cached
32
+
| None ->
33
+
let reversed = List.rev t.messages in
34
+
t.cached_reversed <- Some reversed;
35
+
reversed
36
37
let errors t =
38
List.filter (fun msg -> msg.Message.severity = Message.Error) (messages t)
···
54
if msg.Message.severity = Message.Error then acc + 1 else acc)
55
0 t.messages
56
57
+
let clear t =
58
+
t.messages <- [];
59
+
t.cached_reversed <- None
+4
-8
lib/htmlrw_check/semantic/id_checker.ml
+4
-8
lib/htmlrw_check/semantic/id_checker.ml
···
55
56
(** Attributes that reference a single ID - O(1) lookup. *)
57
let single_id_ref_attrs =
58
-
let tbl = Hashtbl.create 8 in
59
-
List.iter (fun a -> Hashtbl.add tbl a ()) [
60
"for"; (* label *)
61
"form"; (* form-associated elements *)
62
"list"; (* input *)
···
64
"popovertarget"; (* button - references popover element *)
65
"commandfor"; (* button - references element to control *)
66
"anchor"; (* popover - references anchor element *)
67
-
];
68
-
tbl
69
70
let is_single_id_ref_attr name = Hashtbl.mem single_id_ref_attrs name
71
72
(** Attributes that reference multiple IDs (space-separated) - O(1) lookup. *)
73
let multi_id_ref_attrs =
74
-
let tbl = Hashtbl.create 8 in
75
-
List.iter (fun a -> Hashtbl.add tbl a ()) [
76
"headers"; (* td, th *)
77
"aria-labelledby";
78
"aria-describedby";
···
80
"aria-flowto";
81
"aria-owns";
82
"itemref";
83
-
];
84
-
tbl
85
86
let is_multi_id_ref_attr name = Hashtbl.mem multi_id_ref_attrs name
87
···
55
56
(** Attributes that reference a single ID - O(1) lookup. *)
57
let single_id_ref_attrs =
58
+
Attr_utils.hashtbl_of_list [
59
"for"; (* label *)
60
"form"; (* form-associated elements *)
61
"list"; (* input *)
···
63
"popovertarget"; (* button - references popover element *)
64
"commandfor"; (* button - references element to control *)
65
"anchor"; (* popover - references anchor element *)
66
+
]
67
68
let is_single_id_ref_attr name = Hashtbl.mem single_id_ref_attrs name
69
70
(** Attributes that reference multiple IDs (space-separated) - O(1) lookup. *)
71
let multi_id_ref_attrs =
72
+
Attr_utils.hashtbl_of_list [
73
"headers"; (* td, th *)
74
"aria-labelledby";
75
"aria-describedby";
···
77
"aria-flowto";
78
"aria-owns";
79
"itemref";
80
+
]
81
82
let is_multi_id_ref_attr name = Hashtbl.mem multi_id_ref_attrs name
83
+3
-7
lib/htmlrw_check/semantic/lang_detecting_checker.ml
+3
-7
lib/htmlrw_check/semantic/lang_detecting_checker.ml
···
18
19
(* Elements whose text content we skip for language detection - O(1) lookup *)
20
let skip_elements =
21
-
let tbl = Hashtbl.create 20 in
22
-
List.iter (fun e -> Hashtbl.add tbl e ()) [
23
"a"; "button"; "details"; "figcaption"; "form"; "li"; "nav";
24
"pre"; "script"; "select"; "span"; "style"; "summary";
25
"td"; "textarea"; "th"; "tr"
26
-
];
27
-
tbl
28
29
let is_skip_element name = Hashtbl.mem skip_elements name
30
31
(* RTL languages - O(1) lookup *)
32
let rtl_langs =
33
-
let tbl = Hashtbl.create 16 in
34
-
List.iter (fun l -> Hashtbl.add tbl l ()) ["ar"; "azb"; "ckb"; "dv"; "fa"; "he"; "pnb"; "ps"; "sd"; "ug"; "ur"; "iw"];
35
-
tbl
36
37
let is_rtl_lang lang = Hashtbl.mem rtl_langs lang
38
···
18
19
(* Elements whose text content we skip for language detection - O(1) lookup *)
20
let skip_elements =
21
+
Attr_utils.hashtbl_of_list [
22
"a"; "button"; "details"; "figcaption"; "form"; "li"; "nav";
23
"pre"; "script"; "select"; "span"; "style"; "summary";
24
"td"; "textarea"; "th"; "tr"
25
+
]
26
27
let is_skip_element name = Hashtbl.mem skip_elements name
28
29
(* RTL languages - O(1) lookup *)
30
let rtl_langs =
31
+
Attr_utils.hashtbl_of_list ["ar"; "azb"; "ckb"; "dv"; "fa"; "he"; "pnb"; "ps"; "sd"; "ug"; "ur"; "iw"]
32
33
let is_rtl_lang lang = Hashtbl.mem rtl_langs lang
34
+6
-3
lib/htmlrw_check/semantic/nesting_checker.ml
+6
-3
lib/htmlrw_check/semantic/nesting_checker.ml
···
155
let map_num = special_ancestor_number "map" in
156
1 lsl map_num
157
158
-
(** Transparent elements - inherit content model from parent *)
159
-
let transparent_elements = ["a"; "canvas"; "video"; "audio"; "object"; "ins"; "del"; "map"]
160
161
(** Stack node representing an element's context. *)
162
type stack_node = {
···
334
in
335
336
(* Push onto stack *)
337
-
let is_transparent = List.mem name transparent_elements in
338
let node = { ancestor_mask = state.ancestor_mask; name; is_transparent } in
339
state.stack <- node :: state.stack;
340
state.ancestor_mask <- new_mask
···
155
let map_num = special_ancestor_number "map" in
156
1 lsl map_num
157
158
+
(** Transparent elements - inherit content model from parent. O(1) hashtable lookup. *)
159
+
let transparent_elements_tbl =
160
+
Attr_utils.hashtbl_of_list ["a"; "canvas"; "video"; "audio"; "object"; "ins"; "del"; "map"]
161
+
162
+
let is_transparent_element name = Hashtbl.mem transparent_elements_tbl name
163
164
(** Stack node representing an element's context. *)
165
type stack_node = {
···
337
in
338
339
(* Push onto stack *)
340
+
let is_transparent = is_transparent_element name in
341
let node = { ancestor_mask = state.ancestor_mask; name; is_transparent } in
342
state.stack <- node :: state.stack;
343
state.ancestor_mask <- new_mask
+14
-19
lib/htmlrw_check/specialized/aria_checker.ml
+14
-19
lib/htmlrw_check/specialized/aria_checker.ml
···
9
specification. Abstract roles are included but should not be used
10
in HTML content. *)
11
let valid_aria_roles =
12
-
let roles = [
13
(* Document structure roles *)
14
(* Note: "directory" is deprecated in WAI-ARIA 1.2, use "list" instead *)
15
"article"; "associationlist"; "associationlistitemkey";
···
43
44
(* Additional roles *)
45
"application"; "columnheader"; "rowheader";
46
-
] in
47
-
let tbl = Hashtbl.create (List.length roles) in
48
-
List.iter (fun role -> Hashtbl.add tbl role ()) roles;
49
-
tbl
50
51
(** Roles that cannot have accessible names.
52
53
These roles must not have aria-label or aria-labelledby attributes. *)
54
let roles_which_cannot_be_named =
55
-
let roles = [
56
"caption"; "code"; "deletion"; "emphasis"; "generic"; "insertion";
57
"mark"; "none"; "paragraph"; "presentation"; "strong"; "subscript";
58
"suggestion"; "superscript"
59
-
] in
60
-
let tbl = Hashtbl.create (List.length roles) in
61
-
List.iter (fun role -> Hashtbl.add tbl role ()) roles;
62
-
tbl
63
64
(** Elements whose implicit role is 'generic' and cannot have aria-label unless
65
-
they have an explicit role that allows naming. *)
66
-
let elements_with_generic_role = [
67
-
"a"; "abbr"; "address"; "b"; "bdi"; "bdo"; "br"; "caption"; "cite"; "code";
68
-
"colgroup"; "data"; "del"; "dfn"; "div"; "em"; "figcaption"; "hgroup"; "i";
69
-
"ins"; "kbd"; "legend"; "mark"; "p"; "pre"; "q"; "rp"; "rt"; "ruby"; "s";
70
-
"samp"; "small"; "span"; "strong"; "sub"; "sup"; "time"; "title"; "u"; "var";
71
-
"wbr"
72
-
]
73
74
(** Check if element name is a custom element (contains hyphen). *)
75
let is_custom_element name =
···
90
if is_custom_element element_name then false
91
else
92
(* No implicit role - element has generic role unless it's interactive *)
93
-
not (List.mem element_name elements_with_generic_role)
94
95
(** Map from descendant role to set of required ancestor roles. *)
96
let required_role_ancestor_by_descendant : (string, string list) Hashtbl.t =
···
9
specification. Abstract roles are included but should not be used
10
in HTML content. *)
11
let valid_aria_roles =
12
+
Attr_utils.hashtbl_of_list [
13
(* Document structure roles *)
14
(* Note: "directory" is deprecated in WAI-ARIA 1.2, use "list" instead *)
15
"article"; "associationlist"; "associationlistitemkey";
···
43
44
(* Additional roles *)
45
"application"; "columnheader"; "rowheader";
46
+
]
47
48
(** Roles that cannot have accessible names.
49
50
These roles must not have aria-label or aria-labelledby attributes. *)
51
let roles_which_cannot_be_named =
52
+
Attr_utils.hashtbl_of_list [
53
"caption"; "code"; "deletion"; "emphasis"; "generic"; "insertion";
54
"mark"; "none"; "paragraph"; "presentation"; "strong"; "subscript";
55
"suggestion"; "superscript"
56
+
]
57
58
(** Elements whose implicit role is 'generic' and cannot have aria-label unless
59
+
they have an explicit role that allows naming. O(1) lookup. *)
60
+
let elements_with_generic_role =
61
+
Attr_utils.hashtbl_of_list [
62
+
"a"; "abbr"; "address"; "b"; "bdi"; "bdo"; "br"; "caption"; "cite"; "code";
63
+
"colgroup"; "data"; "del"; "dfn"; "div"; "em"; "figcaption"; "hgroup"; "i";
64
+
"ins"; "kbd"; "legend"; "mark"; "p"; "pre"; "q"; "rp"; "rt"; "ruby"; "s";
65
+
"samp"; "small"; "span"; "strong"; "sub"; "sup"; "time"; "title"; "u"; "var";
66
+
"wbr"
67
+
]
68
69
(** Check if element name is a custom element (contains hyphen). *)
70
let is_custom_element name =
···
85
if is_custom_element element_name then false
86
else
87
(* No implicit role - element has generic role unless it's interactive *)
88
+
not (Hashtbl.mem elements_with_generic_role element_name)
89
90
(** Map from descendant role to set of required ancestor roles. *)
91
let required_role_ancestor_by_descendant : (string, string list) Hashtbl.t =
+2
-9
lib/htmlrw_check/specialized/attr_restrictions_checker.ml
+2
-9
lib/htmlrw_check/specialized/attr_restrictions_checker.ml
···
135
if name_lower = "link" then begin
136
let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in
137
let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in
138
-
let rel_value = Attr_utils.get_attr "rel" attrs in
139
let as_value = Attr_utils.get_attr "as" attrs in
140
141
(* imagesizes requires imagesrcset *)
···
155
(* as attribute requires rel="preload" or rel="modulepreload" *)
156
(match as_value with
157
| Some _ ->
158
-
let rel_is_preload = match rel_value with
159
-
| Some v ->
160
-
let rel_lower = String.lowercase_ascii (String.trim v) in
161
-
String.length rel_lower > 0 &&
162
-
(List.mem "preload" (String.split_on_char ' ' rel_lower) ||
163
-
List.mem "modulepreload" (String.split_on_char ' ' rel_lower))
164
-
| None -> false
165
-
in
166
if not rel_is_preload then
167
Message_collector.add_typed collector (`Link `As_requires_preload)
168
| None -> ())
···
135
if name_lower = "link" then begin
136
let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in
137
let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in
138
let as_value = Attr_utils.get_attr "as" attrs in
139
140
(* imagesizes requires imagesrcset *)
···
154
(* as attribute requires rel="preload" or rel="modulepreload" *)
155
(match as_value with
156
| Some _ ->
157
+
let rel_types = Element.get_rel_list element in
158
+
let rel_is_preload = List.mem "preload" rel_types || List.mem "modulepreload" rel_types in
159
if not rel_is_preload then
160
Message_collector.add_typed collector (`Link `As_requires_preload)
161
| None -> ())
+1
-3
lib/htmlrw_check/specialized/label_checker.ml
+1
-3
lib/htmlrw_check/specialized/label_checker.ml
···
4
5
(** Labelable elements that label can reference - O(1) hashtable lookup *)
6
let labelable_elements =
7
-
let tbl = Hashtbl.create 8 in
8
-
List.iter (fun e -> Hashtbl.add tbl e ()) ["button"; "input"; "meter"; "output"; "progress"; "select"; "textarea"];
9
-
tbl
10
11
let is_labelable name = Hashtbl.mem labelable_elements name
12