OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Simple benchmarks for HTML5 parsing and validation *)
2
3let time_it name iterations f =
4 Gc.full_major ();
5 let start = Unix.gettimeofday () in
6 for _ = 1 to iterations do
7 ignore (f ())
8 done;
9 let elapsed = Unix.gettimeofday () -. start in
10 let per_iter = elapsed /. float_of_int iterations *. 1000.0 in
11 Printf.printf "%-40s %d iters in %.3fs (%.3f ms/iter)\n%!" name iterations elapsed per_iter
12
13(* Generate HTML with nested elements *)
14let generate_nested_html depth =
15 let buf = Buffer.create 4096 in
16 Buffer.add_string buf "<!DOCTYPE html><html><head><title>Test</title></head><body>";
17 for i = 1 to depth do
18 Buffer.add_string buf (Printf.sprintf "<div id=\"d%d\" class=\"c1 c2 c3\">" i)
19 done;
20 Buffer.add_string buf "<p>Content</p>";
21 for _ = 1 to depth do
22 Buffer.add_string buf "</div>"
23 done;
24 Buffer.add_string buf "</body></html>";
25 Buffer.contents buf
26
27(* Generate HTML with many sibling elements *)
28let generate_wide_html count =
29 let buf = Buffer.create (count * 100) in
30 Buffer.add_string buf "<!DOCTYPE html><html><head><title>Test</title></head><body><div>";
31 for i = 1 to count do
32 Buffer.add_string buf (Printf.sprintf "<span id=\"s%d\" class=\"cls\" data-value=\"%d\">Item %d</span>" i i i)
33 done;
34 Buffer.add_string buf "</div></body></html>";
35 Buffer.contents buf
36
37(* Generate HTML with table *)
38let generate_table_html rows cols =
39 let buf = Buffer.create (rows * cols * 50) in
40 Buffer.add_string buf "<!DOCTYPE html><html><head><title>Table</title></head><body><table>";
41 Buffer.add_string buf "<thead><tr>";
42 for c = 1 to cols do
43 Buffer.add_string buf (Printf.sprintf "<th>Col %d</th>" c)
44 done;
45 Buffer.add_string buf "</tr></thead><tbody>";
46 for r = 1 to rows do
47 Buffer.add_string buf "<tr>";
48 for c = 1 to cols do
49 Buffer.add_string buf (Printf.sprintf "<td>R%dC%d</td>" r c)
50 done;
51 Buffer.add_string buf "</tr>"
52 done;
53 Buffer.add_string buf "</tbody></table></body></html>";
54 Buffer.contents buf
55
56(* Generate HTML with interactive elements (tests nesting checker) *)
57let generate_interactive_html count =
58 let buf = Buffer.create (count * 200) in
59 Buffer.add_string buf "<!DOCTYPE html><html><head><title>Forms</title></head><body>";
60 for i = 1 to count do
61 Buffer.add_string buf (Printf.sprintf
62 "<form id=\"f%d\"><label for=\"i%d\">Label</label><input type=\"text\" id=\"i%d\" name=\"n%d\"><button type=\"submit\">Submit</button></form>" i i i i)
63 done;
64 Buffer.add_string buf "</body></html>";
65 Buffer.contents buf
66
67(* Generate HTML with validation errors *)
68let generate_invalid_html count =
69 let buf = Buffer.create (count * 100) in
70 Buffer.add_string buf "<!DOCTYPE html><html><head><title>Invalid</title></head><body>";
71 for i = 1 to count do
72 (* Various validation errors *)
73 Buffer.add_string buf (Printf.sprintf "<div role=\"invalid%d\">" i);
74 Buffer.add_string buf "<img>"; (* missing alt *)
75 Buffer.add_string buf "<a href=\"#\"><button>Nested interactive</button></a>";
76 Buffer.add_string buf "</div>"
77 done;
78 Buffer.add_string buf "</body></html>";
79 Buffer.contents buf
80
81(* Benchmark parsing only *)
82let bench_parse html iterations =
83 let bytes = Bytes.of_string html in
84 time_it "Parse only" iterations (fun () ->
85 Html5rw.parse_bytes bytes
86 )
87
88(* Benchmark validation only (on pre-parsed DOM) *)
89let bench_validate doc iterations =
90 time_it "Validate only" iterations (fun () ->
91 Htmlrw_check.check_parsed doc
92 )
93
94(* Benchmark parse + validate *)
95let bench_parse_and_validate html iterations =
96 time_it "Parse + Validate" iterations (fun () ->
97 Htmlrw_check.check_string html
98 )
99
100(* Benchmark repeated message access (tests caching via check result) *)
101let bench_message_access result iterations =
102 time_it "Message access (10x errors/warnings)" iterations (fun () ->
103 for _ = 1 to 10 do
104 ignore (Htmlrw_check.errors result);
105 ignore (Htmlrw_check.warnings result);
106 ignore (Htmlrw_check.messages result)
107 done
108 )
109
110(* Micro-benchmarks for specific optimizations *)
111
112(* Test List.mem vs Hashtbl.mem for table element lookups *)
113let bench_list_mem iterations =
114 let elements = ["table"; "tbody"; "tfoot"; "thead"; "tr"] in
115 let test_names = ["table"; "div"; "tbody"; "span"; "tr"; "p"; "thead"; "a"] in
116 time_it "List.mem (5 elements, 8 lookups)" iterations (fun () ->
117 List.iter (fun name ->
118 ignore (List.mem name elements)
119 ) test_names
120 )
121
122let bench_hashtbl_mem iterations =
123 let tbl = Hashtbl.create 8 in
124 List.iter (fun e -> Hashtbl.add tbl e ()) ["table"; "tbody"; "tfoot"; "thead"; "tr"];
125 let test_names = ["table"; "div"; "tbody"; "span"; "tr"; "p"; "thead"; "a"] in
126 time_it "Hashtbl.mem (5 elements, 8 lookups)" iterations (fun () ->
127 List.iter (fun name ->
128 ignore (Hashtbl.mem tbl name)
129 ) test_names
130 )
131
132(* Test noncharacter validation - bitwise vs List.mem *)
133let bench_nonchar_list iterations =
134 let nonchars = [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
135 0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF;
136 0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF;
137 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
138 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
139 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] in
140 let test_codes = [0x41; 0xFFFE; 0x1000; 0x10FFFF; 0xFDD0; 0x3FFFE; 0x100; 0xFFFF] in
141 time_it "List.mem nonchar (34 elements, 8 checks)" iterations (fun () ->
142 List.iter (fun code ->
143 ignore (List.mem code nonchars)
144 ) test_codes
145 )
146
147let bench_nonchar_bitwise iterations =
148 let test_codes = [0x41; 0xFFFE; 0x1000; 0x10FFFF; 0xFDD0; 0x3FFFE; 0x100; 0xFFFF] in
149 time_it "Bitwise nonchar (8 checks)" iterations (fun () ->
150 List.iter (fun code ->
151 let low16 = code land 0xFFFF in
152 ignore (low16 = 0xFFFE || low16 = 0xFFFF)
153 ) test_codes
154 )
155
156(* Test message list reversal - once vs cached *)
157let bench_list_rev iterations =
158 let msgs = List.init 100 (fun i -> Printf.sprintf "Message %d" i) in
159 time_it "List.rev (100 msgs, 10 accesses)" iterations (fun () ->
160 for _ = 1 to 10 do
161 ignore (List.rev msgs)
162 done
163 )
164
165let bench_cached_rev iterations =
166 let msgs = List.init 100 (fun i -> Printf.sprintf "Message %d" i) in
167 let cached = ref None in
168 time_it "Cached rev (100 msgs, 10 accesses)" iterations (fun () ->
169 cached := None; (* Reset cache each iteration *)
170 for _ = 1 to 10 do
171 match !cached with
172 | Some c -> ignore c
173 | None ->
174 let rev = List.rev msgs in
175 cached := Some rev;
176 ignore rev
177 done
178 )
179
180let () =
181 Printf.printf "\n=== HTML5 Parser/Validator Benchmarks ===\n\n";
182
183 (* Micro-benchmarks first *)
184 Printf.printf "--- Micro-benchmarks: Optimized Code Paths ---\n";
185 Printf.printf "\nTable element lookup (List.mem vs Hashtbl.mem):\n";
186 bench_list_mem 100000;
187 bench_hashtbl_mem 100000;
188
189 Printf.printf "\nNoncharacter validation (List.mem vs bitwise):\n";
190 bench_nonchar_list 100000;
191 bench_nonchar_bitwise 100000;
192
193 Printf.printf "\nMessage list reversal (repeated vs cached):\n";
194 bench_list_rev 10000;
195 bench_cached_rev 10000;
196 Printf.printf "\n";
197
198 (* Small documents *)
199 Printf.printf "--- Small Document (100 nested divs) ---\n";
200 let small_nested = generate_nested_html 100 in
201 Printf.printf "Document size: %d bytes\n" (String.length small_nested);
202 bench_parse small_nested 1000;
203 let doc = Html5rw.parse_bytes (Bytes.of_string small_nested) in
204 bench_validate doc 1000;
205 bench_parse_and_validate small_nested 1000;
206 Printf.printf "\n";
207
208 (* Wide documents *)
209 Printf.printf "--- Wide Document (1000 siblings) ---\n";
210 let wide = generate_wide_html 1000 in
211 Printf.printf "Document size: %d bytes\n" (String.length wide);
212 bench_parse wide 100;
213 let doc = Html5rw.parse_bytes (Bytes.of_string wide) in
214 bench_validate doc 100;
215 bench_parse_and_validate wide 100;
216 Printf.printf "\n";
217
218 (* Table documents *)
219 Printf.printf "--- Table Document (100x20) ---\n";
220 let table = generate_table_html 100 20 in
221 Printf.printf "Document size: %d bytes\n" (String.length table);
222 bench_parse table 100;
223 let doc = Html5rw.parse_bytes (Bytes.of_string table) in
224 bench_validate doc 100;
225 bench_parse_and_validate table 100;
226 Printf.printf "\n";
227
228 (* Interactive elements (nesting checker stress test) *)
229 Printf.printf "--- Interactive Elements (200 forms) ---\n";
230 let interactive = generate_interactive_html 200 in
231 Printf.printf "Document size: %d bytes\n" (String.length interactive);
232 bench_parse interactive 100;
233 let doc = Html5rw.parse_bytes (Bytes.of_string interactive) in
234 bench_validate doc 100;
235 bench_parse_and_validate interactive 100;
236 Printf.printf "\n";
237
238 (* Invalid HTML (validation error generation) *)
239 Printf.printf "--- Invalid HTML (100 error patterns) ---\n";
240 let invalid = generate_invalid_html 100 in
241 Printf.printf "Document size: %d bytes\n" (String.length invalid);
242 bench_parse invalid 100;
243 let doc = Html5rw.parse_bytes (Bytes.of_string invalid) in
244 bench_validate doc 100;
245 bench_parse_and_validate invalid 100;
246 Printf.printf "\n";
247
248 (* Message access benchmark - tests caching *)
249 Printf.printf "--- Message Access (invalid HTML result, 10 accesses) ---\n";
250 let result = Htmlrw_check.check_string invalid in
251 bench_message_access result 1000;
252 Printf.printf "\n";
253
254 (* Large document *)
255 Printf.printf "--- Large Document (500 nested divs) ---\n";
256 let large_nested = generate_nested_html 500 in
257 Printf.printf "Document size: %d bytes\n" (String.length large_nested);
258 bench_parse large_nested 100;
259 let doc = Html5rw.parse_bytes (Bytes.of_string large_nested) in
260 bench_validate doc 100;
261 bench_parse_and_validate large_nested 100;
262 Printf.printf "\n";
263
264 Printf.printf "=== Benchmarks Complete ===\n"