OCaml HTML5 parser/serialiser based on Python's JustHTML

fix

Changed files
+255 -39
lib
test
+6 -2
lib/check/error_code.ml
··· 374 Printf.sprintf "Element %s is missing one or more of the following attributes: [%s]." 375 (q element) attrs_str 376 | `Attr (`Bad_value (`Elem element, `Attr attr, `Value value, `Reason reason)) -> 377 - Printf.sprintf "Bad value %s for attribute %s on element %s: %s" 378 - (q value) (q attr) (q element) reason 379 | `Attr (`Bad_value_generic (`Message message)) -> message 380 | `Attr (`Duplicate_id (`Id id)) -> 381 Printf.sprintf "Duplicate ID %s." (q id)
··· 374 Printf.sprintf "Element %s is missing one or more of the following attributes: [%s]." 375 (q element) attrs_str 376 | `Attr (`Bad_value (`Elem element, `Attr attr, `Value value, `Reason reason)) -> 377 + if reason = "" then 378 + Printf.sprintf "Bad value %s for attribute %s on element %s." 379 + (q value) (q attr) (q element) 380 + else 381 + Printf.sprintf "Bad value %s for attribute %s on element %s: %s" 382 + (q value) (q attr) (q element) reason 383 | `Attr (`Bad_value_generic (`Message message)) -> message 384 | `Attr (`Duplicate_id (`Id id)) -> 385 Printf.sprintf "Duplicate ID %s." (q id)
+249 -37
test/test_validator.ml
··· 402 } in 403 Report.generate_report report output_path 404 405 - let () = 406 - (* Parse command line arguments *) 407 - let args = Array.to_list Sys.argv |> List.tl in 408 - let is_strict = List.mem "--strict" args in 409 - let non_flag_args = List.filter (fun s -> not (String.length s > 0 && s.[0] = '-')) args in 410 - let tests_dir = match non_flag_args with x :: _ -> x | [] -> "validator/tests" in 411 - let report_path = match non_flag_args with _ :: x :: _ -> x | _ -> "test_validator_report.html" in 412 - 413 - (* Apply strict mode if requested - use exact_message which requires exact text but not typed codes *) 414 - if is_strict then begin 415 - strictness := Expected_message.exact_message; 416 - Printf.printf "Running in STRICT mode (exact message matching required)\n%!" 417 - end; 418 - 419 - Printf.printf "Loading messages.json...\n%!"; 420 - let messages_path = Filename.concat tests_dir "messages.json" in 421 - let messages = Validator_messages.load messages_path in 422 - Printf.printf "Loaded %d expected messages\n%!" (Validator_messages.count messages); 423 - 424 - Printf.printf "Discovering test files...\n%!"; 425 - let tests = discover_tests tests_dir in 426 - Printf.printf "Found %d test files\n%!" (List.length tests); 427 - 428 - Printf.printf "Running tests...\n%!"; 429 let total = List.length tests in 430 let results = List.mapi (fun i test -> 431 Printf.printf "\r[%d/%d] %s%!" (i + 1) total test.relative_path; 432 run_test messages test 433 ) tests in 434 Printf.printf "\n%!"; 435 436 - (* Print failing isvalid tests *) 437 let failing_isvalid = List.filter (fun r -> 438 r.file.expected = Valid && not r.passed 439 ) results in 440 if failing_isvalid <> [] then begin 441 - Printf.printf "\n=== Failing isvalid tests ===\n"; 442 List.iter (fun r -> 443 - Printf.printf "%s: %s\n" r.file.relative_path r.details 444 ) failing_isvalid 445 end; 446 447 - (* Print failing haswarn tests *) 448 let failing_haswarn = List.filter (fun r -> 449 r.file.expected = HasWarning && not r.passed 450 ) results in 451 if failing_haswarn <> [] then begin 452 - Printf.printf "\n=== Failing haswarn tests ===\n"; 453 List.iter (fun r -> 454 - Printf.printf "%s\n" r.file.relative_path 455 ) failing_haswarn 456 end; 457 458 - (* Print failing novalid tests *) 459 let failing_novalid = List.filter (fun r -> 460 r.file.expected = Invalid && not r.passed 461 ) results in 462 if failing_novalid <> [] then begin 463 - Printf.printf "\n=== Failing novalid tests (first 50) ===\n"; 464 List.iteri (fun i r -> 465 - if i < 50 then Printf.printf "%s\n" r.file.relative_path 466 ) failing_novalid 467 end; 468 469 - print_summary results; 470 - generate_html_report results report_path; 471 472 - let failed_count = List.filter (fun r -> not r.passed) results |> List.length in 473 - exit (if failed_count > 0 then 1 else 0)
··· 402 } in 403 Report.generate_report report output_path 404 405 + (** Run tests with a given strictness and return results *) 406 + let run_all_tests ~mode_name ~strictness_setting messages tests = 407 + strictness := strictness_setting; 408 + Printf.printf "\n=== Running in %s mode ===\n%!" mode_name; 409 let total = List.length tests in 410 let results = List.mapi (fun i test -> 411 Printf.printf "\r[%d/%d] %s%!" (i + 1) total test.relative_path; 412 run_test messages test 413 ) tests in 414 Printf.printf "\n%!"; 415 + results 416 417 + (** Print failures for a test run *) 418 + let print_failures mode_name results = 419 + Printf.printf "\n--- %s mode results ---\n" mode_name; 420 + 421 let failing_isvalid = List.filter (fun r -> 422 r.file.expected = Valid && not r.passed 423 ) results in 424 if failing_isvalid <> [] then begin 425 + Printf.printf "Failing isvalid tests:\n"; 426 List.iter (fun r -> 427 + Printf.printf " %s: %s\n" r.file.relative_path r.details 428 ) failing_isvalid 429 end; 430 431 let failing_haswarn = List.filter (fun r -> 432 r.file.expected = HasWarning && not r.passed 433 ) results in 434 if failing_haswarn <> [] then begin 435 + Printf.printf "Failing haswarn tests:\n"; 436 List.iter (fun r -> 437 + Printf.printf " %s\n" r.file.relative_path 438 ) failing_haswarn 439 end; 440 441 let failing_novalid = List.filter (fun r -> 442 r.file.expected = Invalid && not r.passed 443 ) results in 444 if failing_novalid <> [] then begin 445 + Printf.printf "Failing novalid tests (first 20):\n"; 446 List.iteri (fun i r -> 447 + if i < 20 then Printf.printf " %s\n" r.file.relative_path 448 ) failing_novalid 449 end; 450 451 + let passed = List.filter (fun r -> r.passed) results |> List.length in 452 + let total = List.length results in 453 + Printf.printf "%s: %d/%d passed (%.1f%%)\n%!" mode_name passed total 454 + (100.0 *. float_of_int passed /. float_of_int total) 455 + 456 + (** Generate combined HTML report for both modes *) 457 + let generate_combined_html_report ~lenient_results ~strict_results output_path = 458 + (* Helper to build file results from a set of results *) 459 + let build_file_results results = 460 + let by_category = group_by_category results in 461 + List.map (fun (category, tests) -> 462 + let passed_count = List.filter (fun r -> r.passed) tests |> List.length in 463 + let failed_count = List.length tests - passed_count in 464 + let test_results = List.mapi (fun i r -> 465 + let outcome_str = match r.file.expected with 466 + | Valid -> "isvalid" 467 + | Invalid -> "novalid" 468 + | HasWarning -> "haswarn" 469 + | Unknown -> "unknown" 470 + in 471 + let description = Printf.sprintf "[%s] %s" outcome_str (Filename.basename r.file.relative_path) in 472 + let expected = match r.expected_message with 473 + | Some m -> m 474 + | None -> match r.file.expected with 475 + | Valid -> "(should produce no errors or warnings)" 476 + | Invalid -> "(should produce at least one error)" 477 + | HasWarning -> "(should produce at least one warning)" 478 + | Unknown -> "(unknown test type)" 479 + in 480 + let actual_str = 481 + let errors = if r.actual_errors = [] then "" 482 + else "Errors:\n • " ^ String.concat "\n • " r.actual_errors in 483 + let warnings = if r.actual_warnings = [] then "" 484 + else "Warnings:\n • " ^ String.concat "\n • " r.actual_warnings in 485 + let infos = if r.actual_infos = [] then "" 486 + else "Info:\n • " ^ String.concat "\n • " r.actual_infos in 487 + if errors = "" && warnings = "" && infos = "" then "(no messages produced)" 488 + else String.trim (errors ^ (if errors <> "" && warnings <> "" then "\n\n" else "") ^ 489 + warnings ^ (if (errors <> "" || warnings <> "") && infos <> "" then "\n\n" else "") ^ 490 + infos) 491 + in 492 + let match_quality_str = match r.match_quality with 493 + | Some q -> Expected_message.match_quality_to_string q 494 + | None -> "N/A" 495 + in 496 + Report.{ 497 + test_num = i + 1; 498 + description; 499 + input = r.file.relative_path; 500 + expected; 501 + actual = actual_str; 502 + success = r.passed; 503 + details = [ 504 + ("Result", r.details); 505 + ("Match Quality", match_quality_str); 506 + ]; 507 + raw_test_data = read_html_source r.file.path; 508 + } 509 + ) tests in 510 + Report.{ 511 + filename = category; 512 + test_type = "HTML5 Validator"; 513 + passed_count; 514 + failed_count; 515 + tests = test_results; 516 + } 517 + ) by_category 518 + in 519 520 + let compute_stats results mode_name = 521 + let total_passed = List.filter (fun r -> r.passed) results |> List.length in 522 + let total_failed = List.length results - total_passed in 523 + let count_quality q = List.filter (fun r -> 524 + match r.match_quality with Some mq -> mq = q | None -> false 525 + ) results |> List.length in 526 + let match_quality_stats : Report.match_quality_stats = { 527 + exact_matches = count_quality Expected_message.Exact_match; 528 + code_matches = count_quality Expected_message.Code_match; 529 + message_matches = count_quality Expected_message.Message_match; 530 + substring_matches = count_quality Expected_message.Substring_match; 531 + severity_mismatches = count_quality Expected_message.Severity_mismatch; 532 + no_matches = count_quality Expected_message.No_match; 533 + not_applicable = List.filter (fun r -> r.match_quality = None) results |> List.length; 534 + } in 535 + let isvalid_results = List.filter (fun r -> r.file.expected = Valid) results in 536 + let novalid_results = List.filter (fun r -> r.file.expected = Invalid) results in 537 + let haswarn_results = List.filter (fun r -> r.file.expected = HasWarning) results in 538 + let count_passed rs = List.filter (fun r -> r.passed) rs |> List.length in 539 + let test_type_stats : Report.test_type_stats = { 540 + isvalid_passed = count_passed isvalid_results; 541 + isvalid_total = List.length isvalid_results; 542 + novalid_passed = count_passed novalid_results; 543 + novalid_total = List.length novalid_results; 544 + haswarn_passed = count_passed haswarn_results; 545 + haswarn_total = List.length haswarn_results; 546 + } in 547 + (total_passed, total_failed, match_quality_stats, test_type_stats, mode_name) 548 + in 549 + 550 + let lenient_stats = compute_stats lenient_results "lenient" in 551 + let strict_stats = compute_stats strict_results "strict" in 552 + 553 + (* Use strict results for the main report, but include both in description *) 554 + let (strict_passed, strict_failed, strict_mq, strict_tt, _) = strict_stats in 555 + let (lenient_passed, _lenient_failed, _, _, _) = lenient_stats in 556 + 557 + let now = Unix.gettimeofday () in 558 + let tm = Unix.localtime now in 559 + let timestamp = Printf.sprintf "%04d-%02d-%02d %02d:%02d:%02d" 560 + (tm.Unix.tm_year + 1900) (tm.Unix.tm_mon + 1) tm.Unix.tm_mday 561 + tm.Unix.tm_hour tm.Unix.tm_min tm.Unix.tm_sec in 562 + 563 + let total = List.length strict_results in 564 + let description = Printf.sprintf 565 + "Tests from the Nu HTML Validator (W3C's official HTML checker). \ 566 + Tests validate HTML5 conformance including element nesting, required attributes, \ 567 + ARIA roles, obsolete elements, and more.\n\n\ 568 + LENIENT mode: %d/%d passed (%.1f%%) - allows substring matching\n\ 569 + STRICT mode: %d/%d passed (%.1f%%) - requires exact message matching" 570 + lenient_passed total (100.0 *. float_of_int lenient_passed /. float_of_int total) 571 + strict_passed total (100.0 *. float_of_int strict_passed /. float_of_int total) 572 + in 573 + 574 + let report : Report.report = { 575 + title = "Nu HTML Validator Tests (Lenient + Strict)"; 576 + test_type = "validator"; 577 + description; 578 + files = build_file_results strict_results; (* Show strict results in detail *) 579 + total_passed = strict_passed; 580 + total_failed = strict_failed; 581 + match_quality = Some strict_mq; 582 + test_type_breakdown = Some strict_tt; 583 + strictness_mode = Some (Printf.sprintf "BOTH (Lenient: %d/%d, Strict: %d/%d)" 584 + lenient_passed total strict_passed total); 585 + run_timestamp = Some timestamp; 586 + } in 587 + Report.generate_report report output_path 588 + 589 + let () = 590 + (* Parse command line arguments *) 591 + let args = Array.to_list Sys.argv |> List.tl in 592 + let is_strict = List.mem "--strict" args in 593 + let is_both = List.mem "--both" args in 594 + let non_flag_args = List.filter (fun s -> not (String.length s > 0 && s.[0] = '-')) args in 595 + let tests_dir = match non_flag_args with x :: _ -> x | [] -> "validator/tests" in 596 + let report_path = match non_flag_args with _ :: x :: _ -> x | _ -> "test_validator_report.html" in 597 + 598 + Printf.printf "Loading messages.json...\n%!"; 599 + let messages_path = Filename.concat tests_dir "messages.json" in 600 + let messages = Validator_messages.load messages_path in 601 + Printf.printf "Loaded %d expected messages\n%!" (Validator_messages.count messages); 602 + 603 + Printf.printf "Discovering test files...\n%!"; 604 + let tests = discover_tests tests_dir in 605 + Printf.printf "Found %d test files\n%!" (List.length tests); 606 + 607 + if is_both then begin 608 + (* Run both modes *) 609 + let lenient_results = run_all_tests ~mode_name:"LENIENT" 610 + ~strictness_setting:Expected_message.lenient messages tests in 611 + let strict_results = run_all_tests ~mode_name:"STRICT" 612 + ~strictness_setting:Expected_message.exact_message messages tests in 613 + 614 + print_failures "LENIENT" lenient_results; 615 + print_failures "STRICT" strict_results; 616 + 617 + Printf.printf "\n=== Summary ===\n"; 618 + let lenient_passed = List.filter (fun r -> r.passed) lenient_results |> List.length in 619 + let strict_passed = List.filter (fun r -> r.passed) strict_results |> List.length in 620 + let total = List.length tests in 621 + Printf.printf "LENIENT: %d/%d (%.1f%%)\n" lenient_passed total 622 + (100.0 *. float_of_int lenient_passed /. float_of_int total); 623 + Printf.printf "STRICT: %d/%d (%.1f%%)\n" strict_passed total 624 + (100.0 *. float_of_int strict_passed /. float_of_int total); 625 + 626 + generate_combined_html_report ~lenient_results ~strict_results report_path; 627 + Printf.printf "\nHTML report written to: %s\n" report_path; 628 + 629 + (* Exit with error if strict mode has failures *) 630 + let strict_failed = List.filter (fun r -> not r.passed) strict_results |> List.length in 631 + exit (if strict_failed > 0 then 1 else 0) 632 + end else begin 633 + (* Single mode (original behavior) *) 634 + if is_strict then begin 635 + strictness := Expected_message.exact_message; 636 + Printf.printf "Running in STRICT mode (exact message matching required)\n%!" 637 + end; 638 + 639 + Printf.printf "Running tests...\n%!"; 640 + let total = List.length tests in 641 + let results = List.mapi (fun i test -> 642 + Printf.printf "\r[%d/%d] %s%!" (i + 1) total test.relative_path; 643 + run_test messages test 644 + ) tests in 645 + Printf.printf "\n%!"; 646 + 647 + (* Print failing isvalid tests *) 648 + let failing_isvalid = List.filter (fun r -> 649 + r.file.expected = Valid && not r.passed 650 + ) results in 651 + if failing_isvalid <> [] then begin 652 + Printf.printf "\n=== Failing isvalid tests ===\n"; 653 + List.iter (fun r -> 654 + Printf.printf "%s: %s\n" r.file.relative_path r.details 655 + ) failing_isvalid 656 + end; 657 + 658 + (* Print failing haswarn tests *) 659 + let failing_haswarn = List.filter (fun r -> 660 + r.file.expected = HasWarning && not r.passed 661 + ) results in 662 + if failing_haswarn <> [] then begin 663 + Printf.printf "\n=== Failing haswarn tests ===\n"; 664 + List.iter (fun r -> 665 + Printf.printf "%s\n" r.file.relative_path 666 + ) failing_haswarn 667 + end; 668 + 669 + (* Print failing novalid tests *) 670 + let failing_novalid = List.filter (fun r -> 671 + r.file.expected = Invalid && not r.passed 672 + ) results in 673 + if failing_novalid <> [] then begin 674 + Printf.printf "\n=== Failing novalid tests (first 50) ===\n"; 675 + List.iteri (fun i r -> 676 + if i < 50 then Printf.printf "%s\n" r.file.relative_path 677 + ) failing_novalid 678 + end; 679 + 680 + print_summary results; 681 + generate_html_report results report_path; 682 + 683 + let failed_count = List.filter (fun r -> not r.passed) results |> List.length in 684 + exit (if failed_count > 0 then 1 else 0) 685 + end