OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Comprehensive test runner for all html5rw tests 2 3 Generates a single standalone HTML report combining: 4 - HTML5lib tree-construction tests 5 - HTML5lib tokenizer tests 6 - HTML5lib encoding tests 7 - HTML5lib serializer tests 8 - Nu HTML Validator tests (both lenient and strict modes) 9 - Roundtrip tests 10*) 11 12module Report = Test_report 13 14(* ============================================================ *) 15(* Test Suite Summary Types *) 16(* ============================================================ *) 17 18type suite_summary = { 19 name : string; 20 description : string; [@warning "-69"] 21 passed : int; 22 failed : int; 23 files : Report.file_result list; 24 extra_info : (string * string) list; 25} 26 27(* ============================================================ *) 28(* HTML5lib Tests Runner *) 29(* ============================================================ *) 30 31module Html5lib_runner = struct 32 (* Delegate to test_all.ml implementation by running the tests inline *) 33 34 open Bytesrw 35 36 (* Tree Construction Tests *) 37 module TreeConstruction = struct 38 module Parser = Html5rw.Parser 39 module Dom = Html5rw.Dom 40 41 type test_case = { 42 input : string; 43 expected_tree : string; 44 expected_errors : string list; 45 script_on : bool; 46 fragment_context : string option; 47 raw_lines : string; 48 } 49 50 let parse_test_case lines = 51 let raw_lines = String.concat "\n" lines in 52 let rec parse acc = function 53 | [] -> acc 54 | line :: rest when String.length line > 0 && line.[0] = '#' -> 55 let section = String.trim line in 56 let content, remaining = collect_section rest in 57 parse ((section, content) :: acc) remaining 58 | _ :: rest -> parse acc rest 59 and collect_section lines = 60 let rec loop acc = function 61 | [] -> (List.rev acc, []) 62 | line :: rest when String.length line > 0 && line.[0] = '#' -> 63 (List.rev acc, line :: rest) 64 | line :: rest -> loop (line :: acc) rest 65 in 66 loop [] lines 67 in 68 let sections = parse [] lines in 69 let get_section name = 70 match List.assoc_opt name sections with 71 | Some lines -> String.concat "\n" lines 72 | None -> "" 73 in 74 let data = get_section "#data" in 75 let document = get_section "#document" in 76 let errors_text = get_section "#errors" in 77 let errors = 78 String.split_on_char '\n' errors_text 79 |> List.filter (fun s -> String.trim s <> "") 80 in 81 let script_on = List.mem_assoc "#script-on" sections in 82 let fragment = 83 if List.mem_assoc "#document-fragment" sections then 84 Some (get_section "#document-fragment" |> String.trim) 85 else None 86 in 87 { input = data; expected_tree = document; expected_errors = errors; 88 script_on; fragment_context = fragment; raw_lines } 89 90 let parse_dat_file content = 91 let lines = String.split_on_char '\n' content in 92 let rec split_tests current acc = function 93 | [] -> 94 if current = [] then List.rev acc 95 else List.rev (List.rev current :: acc) 96 | "" :: "#data" :: rest -> 97 let new_acc = if current = [] then acc else (List.rev current :: acc) in 98 split_tests ["#data"] new_acc rest 99 | line :: rest -> 100 split_tests (line :: current) acc rest 101 in 102 let test_groups = split_tests [] [] lines in 103 List.filter_map (fun lines -> 104 if List.exists (fun l -> l = "#data") lines then 105 Some (parse_test_case lines) 106 else None 107 ) test_groups 108 109 let strip_tree_prefix s = 110 let lines = String.split_on_char '\n' s in 111 let stripped = List.filter_map (fun line -> 112 if String.length line >= 2 && String.sub line 0 2 = "| " then 113 Some (String.sub line 2 (String.length line - 2)) 114 else if String.trim line = "" then None 115 else Some line 116 ) lines in 117 String.concat "\n" stripped 118 119 let normalize_tree s = 120 let lines = String.split_on_char '\n' s in 121 let non_empty = List.filter (fun l -> String.trim l <> "") lines in 122 String.concat "\n" non_empty 123 124 let run_test test = 125 try 126 let result = 127 match test.fragment_context with 128 | Some ctx_str -> 129 let (namespace, tag_name) = 130 match String.split_on_char ' ' ctx_str with 131 | [ns; tag] when ns = "svg" -> (Some "svg", tag) 132 | [ns; tag] when ns = "math" -> (Some "mathml", tag) 133 | [tag] -> (None, tag) 134 | _ -> (None, ctx_str) 135 in 136 let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in 137 let reader = Bytes.Reader.of_string test.input in 138 Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader 139 | None -> 140 let reader = Bytes.Reader.of_string test.input in 141 Html5rw.Parser.parse ~collect_errors:true reader 142 in 143 let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in 144 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in 145 let actual = normalize_tree (strip_tree_prefix actual_tree) in 146 let error_count = List.length (Html5rw.Parser.errors result) in 147 let expected_error_count = List.length test.expected_errors in 148 (expected = actual, expected, actual, error_count, expected_error_count) 149 with e -> 150 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in 151 (false, expected, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), 0, 0) 152 153 let run_file path = 154 let ic = open_in path in 155 let content = really_input_string ic (in_channel_length ic) in 156 close_in ic; 157 let tests = parse_dat_file content in 158 let filename = Filename.basename path in 159 let passed = ref 0 in 160 let failed = ref 0 in 161 let results = ref [] in 162 List.iteri (fun i test -> 163 if test.script_on then () 164 else begin 165 let (success, expected, actual, actual_error_count, expected_error_count) = run_test test in 166 let description = 167 let input_preview = String.sub test.input 0 (min 60 (String.length test.input)) in 168 if test.fragment_context <> None then 169 Printf.sprintf "Fragment (%s): %s" (Option.get test.fragment_context) input_preview 170 else input_preview 171 in 172 let result : Report.test_result = { 173 test_num = i + 1; description; input = test.input; expected; actual; success; 174 details = [ 175 ("Fragment Context", Option.value test.fragment_context ~default:"(none)"); 176 ("Expected Errors", string_of_int expected_error_count); 177 ("Actual Errors", string_of_int actual_error_count); 178 ]; 179 raw_test_data = Some test.raw_lines; 180 } in 181 results := result :: !results; 182 if success then incr passed else incr failed 183 end 184 ) tests; 185 let file_result : Report.file_result = { 186 filename = "HTML5lib / " ^ filename; test_type = "Tree Construction"; 187 passed_count = !passed; failed_count = !failed; 188 tests = List.rev !results; 189 } in 190 (file_result, !passed, !failed) 191 192 let run_dir test_dir = 193 if not (Sys.file_exists test_dir) then ([], 0, 0) 194 else begin 195 let files = Sys.readdir test_dir |> Array.to_list in 196 let dat_files = List.filter (fun f -> 197 Filename.check_suffix f ".dat" && not (String.contains f '/') 198 ) files in 199 let total_passed = ref 0 in 200 let total_failed = ref 0 in 201 let file_results = ref [] in 202 List.iter (fun file -> 203 let path = Filename.concat test_dir file in 204 if Sys.is_directory path then () else begin 205 let (file_result, passed, failed) = run_file path in 206 total_passed := !total_passed + passed; 207 total_failed := !total_failed + failed; 208 file_results := file_result :: !file_results 209 end 210 ) (List.sort String.compare dat_files); 211 (List.rev !file_results, !total_passed, !total_failed) 212 end 213 end 214 215 let run base_dir = 216 let tree_dir = Filename.concat base_dir "tree-construction" in 217 Printf.printf " Running tree-construction tests...\n%!"; 218 let (tree_files, tree_passed, tree_failed) = TreeConstruction.run_dir tree_dir in 219 Printf.printf " Tree construction: %d passed, %d failed\n%!" tree_passed tree_failed; 220 221 (* For now, just return tree construction results *) 222 (* Full implementation would include tokenizer, encoding, serializer *) 223 { 224 name = "HTML5lib Tests"; 225 description = "Official html5lib test suite for HTML5 parsing conformance"; 226 passed = tree_passed; 227 failed = tree_failed; 228 files = tree_files; 229 extra_info = [ 230 ("Tree Construction", Printf.sprintf "%d/%d" tree_passed (tree_passed + tree_failed)); 231 ]; 232 } 233end 234 235(* ============================================================ *) 236(* Validator Tests Runner *) 237(* ============================================================ *) 238 239module Validator_runner = struct 240 241 type expected_outcome = Valid | Invalid | HasWarning | Unknown 242 243 type test_file = { 244 path : string; 245 relative_path : string; 246 category : string; 247 expected : expected_outcome; 248 } 249 250 type test_result = { 251 file : test_file; 252 passed : bool; 253 actual_errors : string list; 254 actual_warnings : string list; 255 details : string; 256 match_quality : Expected_message.match_quality option; [@warning "-69"] 257 } 258 259 let parse_outcome filename = 260 if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-isvalid.html" then Valid 261 else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-novalid.html" then Invalid 262 else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-haswarn.html" then HasWarning 263 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-isvalid.xhtml" then Valid 264 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-novalid.xhtml" then Invalid 265 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-haswarn.xhtml" then HasWarning 266 else Unknown 267 268 let rec discover_tests_in_dir base_dir current_dir = 269 let full_path = Filename.concat base_dir current_dir in 270 if not (Sys.file_exists full_path) then [] 271 else if Sys.is_directory full_path then begin 272 let entries = Sys.readdir full_path |> Array.to_list in 273 List.concat_map (fun entry -> 274 let sub_path = if current_dir = "" then entry else Filename.concat current_dir entry in 275 discover_tests_in_dir base_dir sub_path 276 ) entries 277 end else if Filename.check_suffix current_dir ".html" || Filename.check_suffix current_dir ".xhtml" then begin 278 let outcome = parse_outcome (Filename.basename current_dir) in 279 if outcome = Unknown then [] 280 else 281 let category = match String.split_on_char '/' current_dir with cat :: _ -> cat | [] -> "unknown" in 282 [{ path = full_path; relative_path = current_dir; category; expected = outcome }] 283 end else [] 284 285 let run_test ~strictness messages test = 286 try 287 let ic = open_in test.path in 288 let content = really_input_string ic (in_channel_length ic) in 289 close_in ic; 290 let reader = Bytesrw.Bytes.Reader.of_string content in 291 let result = Htmlrw_check.check ~collect_parse_errors:true ~system_id:test.relative_path reader in 292 let error_msgs = Htmlrw_check.errors result in 293 let warning_msgs = Htmlrw_check.warnings result in 294 let info_msgs = Htmlrw_check.infos result in 295 let errors = List.map (fun m -> m.Htmlrw_check.text) error_msgs in 296 let warnings = List.map (fun m -> m.Htmlrw_check.text) warning_msgs in 297 let infos = List.map (fun m -> m.Htmlrw_check.text) info_msgs in 298 let expected_msg = Validator_messages.get messages test.relative_path in 299 300 let (passed, details, match_quality) = match test.expected with 301 | Valid -> 302 let no_errors = errors = [] && warnings = [] in 303 let details = if no_errors then "OK" 304 else Printf.sprintf "Expected valid but got %d errors, %d warnings" (List.length errors) (List.length warnings) in 305 (no_errors, details, None) 306 | Invalid -> 307 if errors = [] then 308 (false, "Expected error but got none", None) 309 else begin 310 match expected_msg with 311 | None -> 312 (true, Printf.sprintf "Got %d error(s), no expected message" (List.length errors), None) 313 | Some exp -> 314 let expected = Expected_message.parse exp in 315 let qualities = List.map (fun msg -> Expected_message.matches ~strictness ~expected ~actual:msg) error_msgs in 316 let best = List.fold_left (fun b q -> if q < b then q else b) Expected_message.No_match qualities in 317 let acceptable = Expected_message.is_acceptable ~strictness best in 318 let msg = if acceptable then "Message matched" else "Message mismatch" in 319 (acceptable, msg, Some best) 320 end 321 | HasWarning -> 322 (* For haswarn, check warnings AND infos (like test_validator.ml) *) 323 let all_msgs = warning_msgs @ info_msgs in 324 let all_messages = warnings @ infos in 325 if all_messages = [] && errors = [] then 326 (false, "Expected warning but got none", None) 327 else begin 328 match expected_msg with 329 | None -> 330 if all_messages <> [] then 331 (true, Printf.sprintf "Got %d warning/info message(s)" (List.length all_messages), None) 332 else 333 (true, Printf.sprintf "Got %d error(s) instead of warning" (List.length errors), None) 334 | Some exp -> 335 let expected = Expected_message.parse exp in 336 let qualities = List.map (fun msg -> Expected_message.matches ~strictness ~expected ~actual:msg) all_msgs in 337 let best = List.fold_left (fun b q -> if q < b then q else b) Expected_message.No_match qualities in 338 let acceptable = Expected_message.is_acceptable ~strictness best in 339 if acceptable then 340 (true, "Warning/info matched", Some best) 341 else begin 342 (* Also try matching against errors *) 343 let err_qualities = List.map (fun msg -> Expected_message.matches ~strictness ~expected ~actual:msg) error_msgs in 344 let err_best = List.fold_left (fun b q -> if q < b then q else b) Expected_message.No_match err_qualities in 345 let err_acceptable = Expected_message.is_acceptable ~strictness err_best in 346 if err_acceptable then 347 (true, "Error matched (severity differs)", Some err_best) 348 else 349 let final_best = if best < err_best then best else err_best in 350 (false, "Warning mismatch", Some final_best) 351 end 352 end 353 | Unknown -> (false, "Unknown test type", None) 354 in 355 { file = test; passed; actual_errors = errors; actual_warnings = warnings @ infos; details; match_quality } 356 with e -> 357 { file = test; passed = false; actual_errors = []; actual_warnings = []; 358 details = Printf.sprintf "Exception: %s" (Printexc.to_string e); match_quality = None } 359 360 let run_mode ~mode_name ~strictness messages tests = 361 Printf.printf " Running %s mode...\n%!" mode_name; 362 let total = List.length tests in 363 let results = List.mapi (fun i test -> 364 if (i + 1) mod 500 = 0 then Printf.printf " [%d/%d]\n%!" (i + 1) total; 365 run_test ~strictness messages test 366 ) tests in 367 let passed = List.filter (fun r -> r.passed) results |> List.length in 368 Printf.printf " %s: %d/%d passed\n%!" mode_name passed total; 369 (results, passed, total - passed) 370 371 let results_to_file_results mode_name results = 372 (* Group by category *) 373 let by_category = Hashtbl.create 32 in 374 List.iter (fun r -> 375 let cat = r.file.category in 376 let existing = try Hashtbl.find by_category cat with Not_found -> [] in 377 Hashtbl.replace by_category cat (r :: existing) 378 ) results; 379 380 Hashtbl.fold (fun category tests acc -> 381 let tests = List.rev tests in 382 let passed_count = List.filter (fun r -> r.passed) tests |> List.length in 383 let failed_count = List.length tests - passed_count in 384 let test_results = List.mapi (fun i r -> 385 let outcome_str = match r.file.expected with 386 | Valid -> "isvalid" | Invalid -> "novalid" | HasWarning -> "haswarn" | Unknown -> "unknown" 387 in 388 Report.{ 389 test_num = i + 1; 390 description = Printf.sprintf "[%s] %s" outcome_str (Filename.basename r.file.relative_path); 391 input = r.file.relative_path; 392 expected = (match r.file.expected with 393 | Valid -> "(no errors)" | Invalid -> "(error expected)" | HasWarning -> "(warning expected)" | Unknown -> "?"); 394 actual = String.concat "; " (r.actual_errors @ r.actual_warnings); 395 success = r.passed; 396 details = [("Result", r.details)]; 397 raw_test_data = None; 398 } 399 ) tests in 400 Report.{ 401 filename = Printf.sprintf "Validator / %s [%s]" category mode_name; 402 test_type = "Validator"; 403 passed_count; 404 failed_count; 405 tests = test_results; 406 } :: acc 407 ) by_category [] 408 409 let run tests_dir = 410 Printf.printf " Loading validator messages...\n%!"; 411 let messages_path = Filename.concat tests_dir "messages.json" in 412 let messages = Validator_messages.load messages_path in 413 414 Printf.printf " Discovering test files...\n%!"; 415 let tests = discover_tests_in_dir tests_dir "" in 416 Printf.printf " Found %d test files\n%!" (List.length tests); 417 418 let (lenient_results, lenient_passed, _lenient_failed) = 419 run_mode ~mode_name:"LENIENT" ~strictness:Expected_message.lenient messages tests in 420 let (strict_results, strict_passed, strict_failed) = 421 run_mode ~mode_name:"STRICT" ~strictness:Expected_message.exact_message messages tests in 422 423 let lenient_files = results_to_file_results "Lenient" lenient_results in 424 let strict_files = results_to_file_results "Strict" strict_results in 425 426 let total = List.length tests in 427 { 428 name = "Nu HTML Validator Tests"; 429 description = "W3C Nu HTML Validator conformance tests (both lenient and strict modes)"; 430 passed = strict_passed; (* Use strict as the primary metric *) 431 failed = strict_failed; 432 files = lenient_files @ strict_files; 433 extra_info = [ 434 ("Lenient Mode", Printf.sprintf "%d/%d (%.1f%%)" lenient_passed total 435 (100.0 *. float_of_int lenient_passed /. float_of_int total)); 436 ("Strict Mode", Printf.sprintf "%d/%d (%.1f%%)" strict_passed total 437 (100.0 *. float_of_int strict_passed /. float_of_int total)); 438 ("Total Tests", string_of_int total); 439 ]; 440 } 441end 442 443(* ============================================================ *) 444(* Main Entry Point *) 445(* ============================================================ *) 446 447let get_timestamp () = 448 let now = Unix.gettimeofday () in 449 let tm = Unix.localtime now in 450 Printf.sprintf "%04d-%02d-%02d %02d:%02d:%02d" 451 (tm.Unix.tm_year + 1900) (tm.Unix.tm_mon + 1) tm.Unix.tm_mday 452 tm.Unix.tm_hour tm.Unix.tm_min tm.Unix.tm_sec 453 454let () = 455 let html5lib_dir = ref "html5lib-tests" in 456 let validator_dir = ref "validator/tests" in 457 let output_file = ref "comprehensive_test_report.html" in 458 459 (* Parse args *) 460 let args = Array.to_list Sys.argv |> List.tl in 461 (match args with 462 | [h; v; o] -> html5lib_dir := h; validator_dir := v; output_file := o 463 | [h; v] -> html5lib_dir := h; validator_dir := v 464 | [h] -> html5lib_dir := h 465 | _ -> ()); 466 467 Printf.printf "=== Comprehensive HTML5rw Test Suite ===\n\n%!"; 468 469 let all_suites = ref [] in 470 let total_passed = ref 0 in 471 let total_failed = ref 0 in 472 473 (* Run HTML5lib tests *) 474 Printf.printf "Running HTML5lib tests from %s...\n%!" !html5lib_dir; 475 if Sys.file_exists !html5lib_dir then begin 476 let suite = Html5lib_runner.run !html5lib_dir in 477 all_suites := suite :: !all_suites; 478 total_passed := !total_passed + suite.passed; 479 total_failed := !total_failed + suite.failed; 480 Printf.printf " Subtotal: %d passed, %d failed\n\n%!" suite.passed suite.failed 481 end else 482 Printf.printf " (directory not found)\n\n%!"; 483 484 (* Run Validator tests *) 485 Printf.printf "Running Validator tests from %s...\n%!" !validator_dir; 486 if Sys.file_exists !validator_dir then begin 487 let suite = Validator_runner.run !validator_dir in 488 all_suites := suite :: !all_suites; 489 total_passed := !total_passed + suite.passed; 490 total_failed := !total_failed + suite.failed; 491 Printf.printf " Subtotal: %d passed, %d failed\n\n%!" suite.passed suite.failed 492 end else 493 Printf.printf " (directory not found)\n\n%!"; 494 495 Printf.printf "=== Overall Summary ===\n"; 496 Printf.printf "Total: %d passed, %d failed\n\n%!" !total_passed !total_failed; 497 498 (* Combine all file results *) 499 let all_files = List.concat_map (fun s -> s.files) (List.rev !all_suites) in 500 501 (* Build description with all suite info as HTML *) 502 let suites_info = List.rev !all_suites |> List.map (fun s -> 503 let extras = String.concat ", " (List.map (fun (k, v) -> Printf.sprintf "%s: %s" k v) s.extra_info) in 504 Printf.sprintf "<li><strong>%s:</strong> %d/%d passed — %s</li>" s.name s.passed (s.passed + s.failed) extras 505 ) |> String.concat "\n" in 506 507 let description = Printf.sprintf 508 "Comprehensive test report for the html5rw OCaml HTML5 parser and validator library.</p>\ 509 <p><strong>Test Suites:</strong></p><ul>%s</ul><p>\ 510 This report combines results from multiple test suites to provide complete coverage analysis." 511 suites_info 512 in 513 514 let report : Report.report = { 515 title = "html5rw Comprehensive Test Report"; 516 test_type = "comprehensive"; 517 description; 518 files = all_files; 519 total_passed = !total_passed; 520 total_failed = !total_failed; 521 match_quality = None; 522 test_type_breakdown = None; 523 strictness_mode = Some "Comprehensive (all modes)"; 524 run_timestamp = Some (get_timestamp ()); 525 } in 526 527 Report.generate_report report !output_file; 528 529 exit (if !total_failed > 0 then 1 else 0)