OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Test runner for Nu HTML Validator test suite 2 3 This validates HTML5 documents against the upstream Nu HTML Validator test suite. 4 Tests are classified by filename suffix: 5 - `-isvalid.html` : Should produce no errors or warnings 6 - `-novalid.html` : Should produce at least one error 7 - `-haswarn.html` : Should produce at least one warning 8*) 9 10module Report = Test_report 11 12type expected_outcome = 13 | Valid (** -isvalid.html: expect no errors *) 14 | Invalid (** -novalid.html: expect error matching messages.json *) 15 | HasWarning (** -haswarn.html: expect warning matching messages.json *) 16 | Unknown (** Unknown suffix *) 17 18type test_file = { 19 path : string; (** Full filesystem path *) 20 relative_path : string; (** Path relative to tests/, used as key in messages.json *) 21 category : string; (** html, html-aria, etc. *) 22 expected : expected_outcome; 23} 24 25type test_result = { 26 file : test_file; 27 passed : bool; 28 actual_errors : string list; 29 actual_warnings : string list; 30 actual_infos : string list; 31 expected_message : string option; 32 details : string; 33} 34 35(** Parse expected outcome from filename suffix *) 36let parse_outcome filename = 37 (* Check for .html suffix *) 38 if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-isvalid.html" then 39 Valid 40 else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-novalid.html" then 41 Invalid 42 else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-haswarn.html" then 43 HasWarning 44 (* Check for .xhtml suffix *) 45 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-isvalid.xhtml" then 46 Valid 47 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-novalid.xhtml" then 48 Invalid 49 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-haswarn.xhtml" then 50 HasWarning 51 else 52 Unknown 53 54(** Normalize Unicode curly quotes to ASCII *) 55let normalize_quotes s = 56 let buf = Buffer.create (String.length s) in 57 let i = ref 0 in 58 while !i < String.length s do 59 let c = s.[!i] in 60 (* Check for UTF-8 sequences for curly quotes *) 61 if !i + 2 < String.length s && c = '\xe2' then begin 62 let c1 = s.[!i + 1] in 63 let c2 = s.[!i + 2] in 64 if c1 = '\x80' && (c2 = '\x9c' || c2 = '\x9d') then begin 65 (* U+201C or U+201D -> ASCII quote *) 66 Buffer.add_char buf '"'; 67 i := !i + 3 68 end else begin 69 Buffer.add_char buf c; 70 incr i 71 end 72 end else begin 73 Buffer.add_char buf c; 74 incr i 75 end 76 done; 77 Buffer.contents buf 78 79(** Check if actual message matches expected (flexible matching) *) 80let message_matches ~expected ~actual = 81 let expected_norm = normalize_quotes expected in 82 let actual_norm = normalize_quotes actual in 83 (* Exact match *) 84 actual_norm = expected_norm || 85 (* Substring match *) 86 try 87 let _ = Str.search_forward (Str.regexp_string expected_norm) actual_norm 0 in 88 true 89 with Not_found -> 90 false 91 92(** Recursively find all HTML test files *) 93let rec discover_tests_in_dir base_dir current_dir = 94 let full_path = Filename.concat base_dir current_dir in 95 if not (Sys.file_exists full_path) then [] 96 else if Sys.is_directory full_path then begin 97 let entries = Sys.readdir full_path |> Array.to_list in 98 List.concat_map (fun entry -> 99 let sub_path = if current_dir = "" then entry else Filename.concat current_dir entry in 100 discover_tests_in_dir base_dir sub_path 101 ) entries 102 end else if Filename.check_suffix current_dir ".html" || Filename.check_suffix current_dir ".xhtml" then begin 103 let outcome = parse_outcome (Filename.basename current_dir) in 104 if outcome = Unknown then [] 105 else 106 let category = 107 match String.split_on_char '/' current_dir with 108 | cat :: _ -> cat 109 | [] -> "unknown" 110 in 111 [{ path = full_path; relative_path = current_dir; category; expected = outcome }] 112 end else 113 [] 114 115let discover_tests tests_dir = 116 discover_tests_in_dir tests_dir "" 117 118(** Run a single test *) 119let run_test messages test = 120 try 121 let ic = open_in test.path in 122 let content = really_input_string ic (in_channel_length ic) in 123 close_in ic; 124 125 let reader = Bytesrw.Bytes.Reader.of_string content in 126 let result = Html5_checker.check ~collect_parse_errors:true ~system_id:test.relative_path reader in 127 128 let errors = Html5_checker.errors result |> List.map (fun m -> m.Html5_checker.Message.message) in 129 let warnings = Html5_checker.warnings result |> List.map (fun m -> m.Html5_checker.Message.message) in 130 let infos = Html5_checker.infos result |> List.map (fun m -> m.Html5_checker.Message.message) in 131 let expected_msg = Validator_messages.get messages test.relative_path in 132 133 let (passed, details) = match test.expected with 134 | Valid -> 135 (* isvalid tests fail on errors or warnings, but info messages are OK *) 136 if errors = [] && warnings = [] then 137 (true, if infos = [] then "OK: No messages" else Printf.sprintf "OK: No errors/warnings (%d info)" (List.length infos)) 138 else 139 (false, Printf.sprintf "Expected valid but got %d errors, %d warnings" 140 (List.length errors) (List.length warnings)) 141 | Invalid -> 142 if errors = [] then 143 (false, "Expected error but got none") 144 else begin 145 (* For novalid tests, require EXACT message match when expected message is provided *) 146 match expected_msg with 147 | None -> 148 (* No expected message - pass if any error detected *) 149 (true, Printf.sprintf "Got %d error(s), no expected message to match" (List.length errors)) 150 | Some exp -> 151 if List.exists (fun actual -> message_matches ~expected:exp ~actual) errors then 152 (true, Printf.sprintf "Got %d error(s), message matched" (List.length errors)) 153 else 154 (* FAIL if message doesn't match - we want exact matching *) 155 (false, Printf.sprintf "Message mismatch.\n Expected: %s\n Got: %s" 156 exp (String.concat "\n " errors)) 157 end 158 | HasWarning -> 159 (* For haswarn, require message match against warnings or infos *) 160 let all_messages = warnings @ infos in 161 if all_messages = [] && errors = [] then 162 (false, "Expected warning but got none") 163 else begin 164 match expected_msg with 165 | None -> 166 if all_messages <> [] then 167 (true, Printf.sprintf "Got %d warning/info message(s)" (List.length all_messages)) 168 else 169 (true, Printf.sprintf "Got %d error(s) instead of warning" (List.length errors)) 170 | Some exp -> 171 if List.exists (fun actual -> message_matches ~expected:exp ~actual) all_messages then 172 (true, Printf.sprintf "Got %d warning/info message(s), matched" (List.length all_messages)) 173 else if List.exists (fun actual -> message_matches ~expected:exp ~actual) errors then 174 (* Accept error if message matches (severity might differ) *) 175 (true, Printf.sprintf "Got error instead of warning, but message matched") 176 else 177 (false, Printf.sprintf "Message mismatch.\n Expected: %s\n Got warnings: %s\n Got errors: %s" 178 exp (String.concat "\n " (if all_messages = [] then ["(none)"] else all_messages)) 179 (String.concat "\n " (if errors = [] then ["(none)"] else errors))) 180 end 181 | Unknown -> 182 (false, "Unknown test type") 183 in 184 { file = test; passed; actual_errors = errors; actual_warnings = warnings; 185 actual_infos = infos; expected_message = expected_msg; details } 186 with e -> 187 { file = test; passed = false; actual_errors = []; actual_warnings = []; 188 actual_infos = []; expected_message = None; details = Printf.sprintf "Exception: %s" (Printexc.to_string e) } 189 190(** Group tests by category *) 191let group_by_category tests = 192 let tbl = Hashtbl.create 16 in 193 List.iter (fun test -> 194 let cat = test.file.category in 195 let existing = try Hashtbl.find tbl cat with Not_found -> [] in 196 Hashtbl.replace tbl cat (test :: existing) 197 ) tests; 198 Hashtbl.fold (fun k v acc -> (k, List.rev v) :: acc) tbl [] 199 |> List.sort (fun (a, _) (b, _) -> String.compare a b) 200 201(** Print summary to console *) 202let print_summary results = 203 let by_category = group_by_category results in 204 Printf.printf "\n=== Results by Category ===\n"; 205 List.iter (fun (cat, tests) -> 206 let passed = List.filter (fun r -> r.passed) tests |> List.length in 207 let total = List.length tests in 208 Printf.printf "%s: %d/%d passed (%.1f%%)\n" cat passed total 209 (100.0 *. float_of_int passed /. float_of_int (max 1 total)) 210 ) by_category; 211 212 (* Breakdown by test type *) 213 let isvalid_results = List.filter (fun r -> r.file.expected = Valid) results in 214 let novalid_results = List.filter (fun r -> r.file.expected = Invalid) results in 215 let haswarn_results = List.filter (fun r -> r.file.expected = HasWarning) results in 216 217 let count_passed rs = List.filter (fun r -> r.passed) rs |> List.length in 218 219 Printf.printf "\n=== Results by Test Type ===\n"; 220 Printf.printf "isvalid (no errors expected): %d/%d passed (%.1f%%)\n" 221 (count_passed isvalid_results) (List.length isvalid_results) 222 (100.0 *. float_of_int (count_passed isvalid_results) /. float_of_int (max 1 (List.length isvalid_results))); 223 Printf.printf "novalid (errors expected): %d/%d passed (%.1f%%)\n" 224 (count_passed novalid_results) (List.length novalid_results) 225 (100.0 *. float_of_int (count_passed novalid_results) /. float_of_int (max 1 (List.length novalid_results))); 226 Printf.printf "haswarn (warnings expected): %d/%d passed (%.1f%%)\n" 227 (count_passed haswarn_results) (List.length haswarn_results) 228 (100.0 *. float_of_int (count_passed haswarn_results) /. float_of_int (max 1 (List.length haswarn_results))); 229 230 let total_passed = List.filter (fun r -> r.passed) results |> List.length in 231 let total = List.length results in 232 Printf.printf "\n=== Overall ===\n"; 233 Printf.printf "Total: %d/%d passed (%.1f%%)\n" total_passed total 234 (100.0 *. float_of_int total_passed /. float_of_int (max 1 total)) 235 236(** Generate HTML report *) 237let generate_html_report results output_path = 238 let by_category = group_by_category results in 239 240 let file_results = List.map (fun (category, tests) -> 241 let passed_count = List.filter (fun r -> r.passed) tests |> List.length in 242 let failed_count = List.length tests - passed_count in 243 let test_results = List.mapi (fun i r -> 244 let outcome_str = match r.file.expected with 245 | Valid -> "valid" 246 | Invalid -> "invalid" 247 | HasWarning -> "has-warning" 248 | Unknown -> "unknown" 249 in 250 let description = Printf.sprintf "[%s] %s" outcome_str r.file.relative_path in 251 let expected = match r.expected_message with 252 | Some m -> m 253 | None -> "(no expected message)" 254 in 255 let actual_str = 256 let errors = if r.actual_errors = [] then "" 257 else "Errors:\n" ^ String.concat "\n" r.actual_errors in 258 let warnings = if r.actual_warnings = [] then "" 259 else "Warnings:\n" ^ String.concat "\n" r.actual_warnings in 260 let infos = if r.actual_infos = [] then "" 261 else "Info:\n" ^ String.concat "\n" r.actual_infos in 262 if errors = "" && warnings = "" && infos = "" then "(no messages)" 263 else String.trim (errors ^ "\n" ^ warnings ^ "\n" ^ infos) 264 in 265 Report.{ 266 test_num = i + 1; 267 description; 268 input = r.file.relative_path; 269 expected; 270 actual = actual_str; 271 success = r.passed; 272 details = [("Status", r.details)]; 273 raw_test_data = None; 274 } 275 ) tests in 276 Report.{ 277 filename = category; 278 test_type = "HTML5 Validator"; 279 passed_count; 280 failed_count; 281 tests = test_results; 282 } 283 ) by_category in 284 285 let total_passed = List.filter (fun r -> r.passed) results |> List.length in 286 let total_failed = List.length results - total_passed in 287 288 let report : Report.report = { 289 title = "Nu HTML Validator Tests"; 290 test_type = "validator"; 291 description = "Tests from the Nu HTML Validator (W3C's official HTML checker). \ 292 Tests validate HTML5 conformance including element nesting, required attributes, \ 293 ARIA roles, obsolete elements, and more. Each test file is classified by suffix: \ 294 -isvalid.html (should produce no errors), -novalid.html (should produce errors), \ 295 -haswarn.html (should produce warnings)."; 296 files = file_results; 297 total_passed; 298 total_failed; 299 } in 300 Report.generate_report report output_path 301 302let () = 303 let tests_dir = if Array.length Sys.argv > 1 then Sys.argv.(1) else "validator/tests" in 304 let report_path = if Array.length Sys.argv > 2 then Sys.argv.(2) else "test_validator_report.html" in 305 306 Printf.printf "Loading messages.json...\n%!"; 307 let messages_path = Filename.concat tests_dir "messages.json" in 308 let messages = Validator_messages.load messages_path in 309 Printf.printf "Loaded %d expected messages\n%!" (Validator_messages.count messages); 310 311 Printf.printf "Discovering test files...\n%!"; 312 let tests = discover_tests tests_dir in 313 Printf.printf "Found %d test files\n%!" (List.length tests); 314 315 Printf.printf "Running tests...\n%!"; 316 let results = List.map (run_test messages) tests in 317 318 (* Print failing isvalid tests *) 319 let failing_isvalid = List.filter (fun r -> 320 r.file.expected = Valid && not r.passed 321 ) results in 322 if failing_isvalid <> [] then begin 323 Printf.printf "\n=== Failing isvalid tests ===\n"; 324 List.iter (fun r -> 325 Printf.printf "%s: %s\n" r.file.relative_path r.details 326 ) failing_isvalid 327 end; 328 329 (* Print failing haswarn tests *) 330 let failing_haswarn = List.filter (fun r -> 331 r.file.expected = HasWarning && not r.passed 332 ) results in 333 if failing_haswarn <> [] then begin 334 Printf.printf "\n=== Failing haswarn tests ===\n"; 335 List.iter (fun r -> 336 Printf.printf "%s\n" r.file.relative_path 337 ) failing_haswarn 338 end; 339 340 (* Print failing novalid tests *) 341 let failing_novalid = List.filter (fun r -> 342 r.file.expected = Invalid && not r.passed 343 ) results in 344 if failing_novalid <> [] then begin 345 Printf.printf "\n=== Failing novalid tests (first 50) ===\n"; 346 List.iteri (fun i r -> 347 if i < 50 then Printf.printf "%s\n" r.file.relative_path 348 ) failing_novalid 349 end; 350 351 print_summary results; 352 generate_html_report results report_path; 353 354 let failed_count = List.filter (fun r -> not r.passed) results |> List.length in 355 exit (if failed_count > 0 then 1 else 0)