OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 9.9 kB view raw
1(* Simple benchmarks for HTML5 parsing and validation *) 2 3let time_it name iterations f = 4 Gc.full_major (); 5 let start = Unix.gettimeofday () in 6 for _ = 1 to iterations do 7 ignore (f ()) 8 done; 9 let elapsed = Unix.gettimeofday () -. start in 10 let per_iter = elapsed /. float_of_int iterations *. 1000.0 in 11 Printf.printf "%-40s %d iters in %.3fs (%.3f ms/iter)\n%!" name iterations elapsed per_iter 12 13(* Generate HTML with nested elements *) 14let generate_nested_html depth = 15 let buf = Buffer.create 4096 in 16 Buffer.add_string buf "<!DOCTYPE html><html><head><title>Test</title></head><body>"; 17 for i = 1 to depth do 18 Buffer.add_string buf (Printf.sprintf "<div id=\"d%d\" class=\"c1 c2 c3\">" i) 19 done; 20 Buffer.add_string buf "<p>Content</p>"; 21 for _ = 1 to depth do 22 Buffer.add_string buf "</div>" 23 done; 24 Buffer.add_string buf "</body></html>"; 25 Buffer.contents buf 26 27(* Generate HTML with many sibling elements *) 28let generate_wide_html count = 29 let buf = Buffer.create (count * 100) in 30 Buffer.add_string buf "<!DOCTYPE html><html><head><title>Test</title></head><body><div>"; 31 for i = 1 to count do 32 Buffer.add_string buf (Printf.sprintf "<span id=\"s%d\" class=\"cls\" data-value=\"%d\">Item %d</span>" i i i) 33 done; 34 Buffer.add_string buf "</div></body></html>"; 35 Buffer.contents buf 36 37(* Generate HTML with table *) 38let generate_table_html rows cols = 39 let buf = Buffer.create (rows * cols * 50) in 40 Buffer.add_string buf "<!DOCTYPE html><html><head><title>Table</title></head><body><table>"; 41 Buffer.add_string buf "<thead><tr>"; 42 for c = 1 to cols do 43 Buffer.add_string buf (Printf.sprintf "<th>Col %d</th>" c) 44 done; 45 Buffer.add_string buf "</tr></thead><tbody>"; 46 for r = 1 to rows do 47 Buffer.add_string buf "<tr>"; 48 for c = 1 to cols do 49 Buffer.add_string buf (Printf.sprintf "<td>R%dC%d</td>" r c) 50 done; 51 Buffer.add_string buf "</tr>" 52 done; 53 Buffer.add_string buf "</tbody></table></body></html>"; 54 Buffer.contents buf 55 56(* Generate HTML with interactive elements (tests nesting checker) *) 57let generate_interactive_html count = 58 let buf = Buffer.create (count * 200) in 59 Buffer.add_string buf "<!DOCTYPE html><html><head><title>Forms</title></head><body>"; 60 for i = 1 to count do 61 Buffer.add_string buf (Printf.sprintf 62 "<form id=\"f%d\"><label for=\"i%d\">Label</label><input type=\"text\" id=\"i%d\" name=\"n%d\"><button type=\"submit\">Submit</button></form>" i i i i) 63 done; 64 Buffer.add_string buf "</body></html>"; 65 Buffer.contents buf 66 67(* Generate HTML with validation errors *) 68let generate_invalid_html count = 69 let buf = Buffer.create (count * 100) in 70 Buffer.add_string buf "<!DOCTYPE html><html><head><title>Invalid</title></head><body>"; 71 for i = 1 to count do 72 (* Various validation errors *) 73 Buffer.add_string buf (Printf.sprintf "<div role=\"invalid%d\">" i); 74 Buffer.add_string buf "<img>"; (* missing alt *) 75 Buffer.add_string buf "<a href=\"#\"><button>Nested interactive</button></a>"; 76 Buffer.add_string buf "</div>" 77 done; 78 Buffer.add_string buf "</body></html>"; 79 Buffer.contents buf 80 81(* Benchmark parsing only *) 82let bench_parse html iterations = 83 let bytes = Bytes.of_string html in 84 time_it "Parse only" iterations (fun () -> 85 Html5rw.parse_bytes bytes 86 ) 87 88(* Benchmark validation only (on pre-parsed DOM) *) 89let bench_validate doc iterations = 90 time_it "Validate only" iterations (fun () -> 91 Htmlrw_check.check_parsed doc 92 ) 93 94(* Benchmark parse + validate *) 95let bench_parse_and_validate html iterations = 96 time_it "Parse + Validate" iterations (fun () -> 97 Htmlrw_check.check_string html 98 ) 99 100(* Benchmark repeated message access (tests caching via check result) *) 101let bench_message_access result iterations = 102 time_it "Message access (10x errors/warnings)" iterations (fun () -> 103 for _ = 1 to 10 do 104 ignore (Htmlrw_check.errors result); 105 ignore (Htmlrw_check.warnings result); 106 ignore (Htmlrw_check.messages result) 107 done 108 ) 109 110(* Micro-benchmarks for specific optimizations *) 111 112(* Test List.mem vs Hashtbl.mem for table element lookups *) 113let bench_list_mem iterations = 114 let elements = ["table"; "tbody"; "tfoot"; "thead"; "tr"] in 115 let test_names = ["table"; "div"; "tbody"; "span"; "tr"; "p"; "thead"; "a"] in 116 time_it "List.mem (5 elements, 8 lookups)" iterations (fun () -> 117 List.iter (fun name -> 118 ignore (List.mem name elements) 119 ) test_names 120 ) 121 122let bench_hashtbl_mem iterations = 123 let tbl = Hashtbl.create 8 in 124 List.iter (fun e -> Hashtbl.add tbl e ()) ["table"; "tbody"; "tfoot"; "thead"; "tr"]; 125 let test_names = ["table"; "div"; "tbody"; "span"; "tr"; "p"; "thead"; "a"] in 126 time_it "Hashtbl.mem (5 elements, 8 lookups)" iterations (fun () -> 127 List.iter (fun name -> 128 ignore (Hashtbl.mem tbl name) 129 ) test_names 130 ) 131 132(* Test noncharacter validation - bitwise vs List.mem *) 133let bench_nonchar_list iterations = 134 let nonchars = [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF; 135 0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF; 136 0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF; 137 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF; 138 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF; 139 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] in 140 let test_codes = [0x41; 0xFFFE; 0x1000; 0x10FFFF; 0xFDD0; 0x3FFFE; 0x100; 0xFFFF] in 141 time_it "List.mem nonchar (34 elements, 8 checks)" iterations (fun () -> 142 List.iter (fun code -> 143 ignore (List.mem code nonchars) 144 ) test_codes 145 ) 146 147let bench_nonchar_bitwise iterations = 148 let test_codes = [0x41; 0xFFFE; 0x1000; 0x10FFFF; 0xFDD0; 0x3FFFE; 0x100; 0xFFFF] in 149 time_it "Bitwise nonchar (8 checks)" iterations (fun () -> 150 List.iter (fun code -> 151 let low16 = code land 0xFFFF in 152 ignore (low16 = 0xFFFE || low16 = 0xFFFF) 153 ) test_codes 154 ) 155 156(* Test message list reversal - once vs cached *) 157let bench_list_rev iterations = 158 let msgs = List.init 100 (fun i -> Printf.sprintf "Message %d" i) in 159 time_it "List.rev (100 msgs, 10 accesses)" iterations (fun () -> 160 for _ = 1 to 10 do 161 ignore (List.rev msgs) 162 done 163 ) 164 165let bench_cached_rev iterations = 166 let msgs = List.init 100 (fun i -> Printf.sprintf "Message %d" i) in 167 let cached = ref None in 168 time_it "Cached rev (100 msgs, 10 accesses)" iterations (fun () -> 169 cached := None; (* Reset cache each iteration *) 170 for _ = 1 to 10 do 171 match !cached with 172 | Some c -> ignore c 173 | None -> 174 let rev = List.rev msgs in 175 cached := Some rev; 176 ignore rev 177 done 178 ) 179 180let () = 181 Printf.printf "\n=== HTML5 Parser/Validator Benchmarks ===\n\n"; 182 183 (* Micro-benchmarks first *) 184 Printf.printf "--- Micro-benchmarks: Optimized Code Paths ---\n"; 185 Printf.printf "\nTable element lookup (List.mem vs Hashtbl.mem):\n"; 186 bench_list_mem 100000; 187 bench_hashtbl_mem 100000; 188 189 Printf.printf "\nNoncharacter validation (List.mem vs bitwise):\n"; 190 bench_nonchar_list 100000; 191 bench_nonchar_bitwise 100000; 192 193 Printf.printf "\nMessage list reversal (repeated vs cached):\n"; 194 bench_list_rev 10000; 195 bench_cached_rev 10000; 196 Printf.printf "\n"; 197 198 (* Small documents *) 199 Printf.printf "--- Small Document (100 nested divs) ---\n"; 200 let small_nested = generate_nested_html 100 in 201 Printf.printf "Document size: %d bytes\n" (String.length small_nested); 202 bench_parse small_nested 1000; 203 let doc = Html5rw.parse_bytes (Bytes.of_string small_nested) in 204 bench_validate doc 1000; 205 bench_parse_and_validate small_nested 1000; 206 Printf.printf "\n"; 207 208 (* Wide documents *) 209 Printf.printf "--- Wide Document (1000 siblings) ---\n"; 210 let wide = generate_wide_html 1000 in 211 Printf.printf "Document size: %d bytes\n" (String.length wide); 212 bench_parse wide 100; 213 let doc = Html5rw.parse_bytes (Bytes.of_string wide) in 214 bench_validate doc 100; 215 bench_parse_and_validate wide 100; 216 Printf.printf "\n"; 217 218 (* Table documents *) 219 Printf.printf "--- Table Document (100x20) ---\n"; 220 let table = generate_table_html 100 20 in 221 Printf.printf "Document size: %d bytes\n" (String.length table); 222 bench_parse table 100; 223 let doc = Html5rw.parse_bytes (Bytes.of_string table) in 224 bench_validate doc 100; 225 bench_parse_and_validate table 100; 226 Printf.printf "\n"; 227 228 (* Interactive elements (nesting checker stress test) *) 229 Printf.printf "--- Interactive Elements (200 forms) ---\n"; 230 let interactive = generate_interactive_html 200 in 231 Printf.printf "Document size: %d bytes\n" (String.length interactive); 232 bench_parse interactive 100; 233 let doc = Html5rw.parse_bytes (Bytes.of_string interactive) in 234 bench_validate doc 100; 235 bench_parse_and_validate interactive 100; 236 Printf.printf "\n"; 237 238 (* Invalid HTML (validation error generation) *) 239 Printf.printf "--- Invalid HTML (100 error patterns) ---\n"; 240 let invalid = generate_invalid_html 100 in 241 Printf.printf "Document size: %d bytes\n" (String.length invalid); 242 bench_parse invalid 100; 243 let doc = Html5rw.parse_bytes (Bytes.of_string invalid) in 244 bench_validate doc 100; 245 bench_parse_and_validate invalid 100; 246 Printf.printf "\n"; 247 248 (* Message access benchmark - tests caching *) 249 Printf.printf "--- Message Access (invalid HTML result, 10 accesses) ---\n"; 250 let result = Htmlrw_check.check_string invalid in 251 bench_message_access result 1000; 252 Printf.printf "\n"; 253 254 (* Large document *) 255 Printf.printf "--- Large Document (500 nested divs) ---\n"; 256 let large_nested = generate_nested_html 500 in 257 Printf.printf "Document size: %d bytes\n" (String.length large_nested); 258 bench_parse large_nested 100; 259 let doc = Html5rw.parse_bytes (Bytes.of_string large_nested) in 260 bench_validate doc 100; 261 bench_parse_and_validate large_nested 100; 262 Printf.printf "\n"; 263 264 Printf.printf "=== Benchmarks Complete ===\n"