OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Test runner for Nu HTML Validator test suite 2 3 This validates HTML5 documents against the upstream Nu HTML Validator test suite. 4 Tests are classified by filename suffix: 5 - `-isvalid.html` : Should produce no errors or warnings 6 - `-novalid.html` : Should produce at least one error 7 - `-haswarn.html` : Should produce at least one warning 8*) 9 10module Report = Test_report 11 12type expected_outcome = 13 | Valid (** -isvalid.html: expect no errors *) 14 | Invalid (** -novalid.html: expect error matching messages.json *) 15 | HasWarning (** -haswarn.html: expect warning matching messages.json *) 16 | Unknown (** Unknown suffix *) 17 18type test_file = { 19 path : string; (** Full filesystem path *) 20 relative_path : string; (** Path relative to tests/, used as key in messages.json *) 21 category : string; (** html, html-aria, etc. *) 22 expected : expected_outcome; 23} 24 25type test_result = { 26 file : test_file; 27 passed : bool; 28 actual_errors : string list; 29 actual_warnings : string list; 30 actual_infos : string list; 31 expected_message : string option; 32 details : string; 33} 34 35(** Parse expected outcome from filename suffix *) 36let parse_outcome filename = 37 (* Check for .html suffix *) 38 if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-isvalid.html" then 39 Valid 40 else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-novalid.html" then 41 Invalid 42 else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-haswarn.html" then 43 HasWarning 44 (* Check for .xhtml suffix *) 45 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-isvalid.xhtml" then 46 Valid 47 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-novalid.xhtml" then 48 Invalid 49 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-haswarn.xhtml" then 50 HasWarning 51 else 52 Unknown 53 54(** Normalize Unicode curly quotes to ASCII *) 55let normalize_quotes s = 56 let buf = Buffer.create (String.length s) in 57 let i = ref 0 in 58 while !i < String.length s do 59 let c = s.[!i] in 60 (* Check for UTF-8 sequences for curly quotes *) 61 if !i + 2 < String.length s && c = '\xe2' then begin 62 let c1 = s.[!i + 1] in 63 let c2 = s.[!i + 2] in 64 if c1 = '\x80' && (c2 = '\x9c' || c2 = '\x9d') then begin 65 (* U+201C or U+201D -> ASCII quote *) 66 Buffer.add_char buf '"'; 67 i := !i + 3 68 end else begin 69 Buffer.add_char buf c; 70 incr i 71 end 72 end else begin 73 Buffer.add_char buf c; 74 incr i 75 end 76 done; 77 Buffer.contents buf 78 79(** Check if actual message matches expected (flexible matching) *) 80let message_matches ~expected ~actual = 81 let expected_norm = normalize_quotes expected in 82 let actual_norm = normalize_quotes actual in 83 (* Exact match *) 84 actual_norm = expected_norm || 85 (* Substring match *) 86 try 87 let _ = Str.search_forward (Str.regexp_string expected_norm) actual_norm 0 in 88 true 89 with Not_found -> 90 false 91 92(** Recursively find all HTML test files *) 93let rec discover_tests_in_dir base_dir current_dir = 94 let full_path = Filename.concat base_dir current_dir in 95 if not (Sys.file_exists full_path) then [] 96 else if Sys.is_directory full_path then begin 97 let entries = Sys.readdir full_path |> Array.to_list in 98 List.concat_map (fun entry -> 99 let sub_path = if current_dir = "" then entry else Filename.concat current_dir entry in 100 discover_tests_in_dir base_dir sub_path 101 ) entries 102 end else if Filename.check_suffix current_dir ".html" || Filename.check_suffix current_dir ".xhtml" then begin 103 let outcome = parse_outcome (Filename.basename current_dir) in 104 if outcome = Unknown then [] 105 else 106 let category = 107 match String.split_on_char '/' current_dir with 108 | cat :: _ -> cat 109 | [] -> "unknown" 110 in 111 [{ path = full_path; relative_path = current_dir; category; expected = outcome }] 112 end else 113 [] 114 115let discover_tests tests_dir = 116 discover_tests_in_dir tests_dir "" 117 118(** Run a single test *) 119let run_test messages test = 120 try 121 let ic = open_in test.path in 122 let content = really_input_string ic (in_channel_length ic) in 123 close_in ic; 124 125 let reader = Bytesrw.Bytes.Reader.of_string content in 126 let result = Html5_checker.check ~collect_parse_errors:true ~system_id:test.relative_path reader in 127 128 let errors = Html5_checker.errors result |> List.map (fun m -> m.Html5_checker.Message.message) in 129 let warnings = Html5_checker.warnings result |> List.map (fun m -> m.Html5_checker.Message.message) in 130 let infos = Html5_checker.infos result |> List.map (fun m -> m.Html5_checker.Message.message) in 131 let expected_msg = Validator_messages.get messages test.relative_path in 132 133 let (passed, details) = match test.expected with 134 | Valid -> 135 (* isvalid tests fail on errors or warnings, but info messages are OK *) 136 if errors = [] && warnings = [] then 137 (true, if infos = [] then "OK: No messages" else Printf.sprintf "OK: No errors/warnings (%d info)" (List.length infos)) 138 else 139 (false, Printf.sprintf "Expected valid but got %d errors, %d warnings" 140 (List.length errors) (List.length warnings)) 141 | Invalid -> 142 if errors = [] then 143 (false, "Expected error but got none") 144 else begin 145 (* For novalid tests, we pass if ANY error is produced. 146 Message matching is optional - our messages may differ from Nu validator. *) 147 let msg_matched = match expected_msg with 148 | None -> true 149 | Some exp -> List.exists (fun actual -> message_matches ~expected:exp ~actual) errors 150 in 151 if msg_matched then 152 (true, Printf.sprintf "Got %d error(s), message matched" (List.length errors)) 153 else 154 (* Still pass - we detected an error even if message differs *) 155 (true, Printf.sprintf "Got %d error(s) (message format differs)" (List.length errors)) 156 end 157 | HasWarning -> 158 (* For haswarn, accept warnings or info messages (Nu validator uses info for some) *) 159 if warnings <> [] then 160 (true, Printf.sprintf "Got %d warning(s)" (List.length warnings)) 161 else if infos <> [] then 162 (true, Printf.sprintf "Got %d info message(s)" (List.length infos)) 163 else if errors <> [] then 164 (* Also accept errors as they indicate we caught something *) 165 (true, Printf.sprintf "Got %d error(s) instead of warning" (List.length errors)) 166 else 167 (false, "Expected warning but got none") 168 | Unknown -> 169 (false, "Unknown test type") 170 in 171 { file = test; passed; actual_errors = errors; actual_warnings = warnings; 172 actual_infos = infos; expected_message = expected_msg; details } 173 with e -> 174 { file = test; passed = false; actual_errors = []; actual_warnings = []; 175 actual_infos = []; expected_message = None; details = Printf.sprintf "Exception: %s" (Printexc.to_string e) } 176 177(** Group tests by category *) 178let group_by_category tests = 179 let tbl = Hashtbl.create 16 in 180 List.iter (fun test -> 181 let cat = test.file.category in 182 let existing = try Hashtbl.find tbl cat with Not_found -> [] in 183 Hashtbl.replace tbl cat (test :: existing) 184 ) tests; 185 Hashtbl.fold (fun k v acc -> (k, List.rev v) :: acc) tbl [] 186 |> List.sort (fun (a, _) (b, _) -> String.compare a b) 187 188(** Print summary to console *) 189let print_summary results = 190 let by_category = group_by_category results in 191 Printf.printf "\n=== Results by Category ===\n"; 192 List.iter (fun (cat, tests) -> 193 let passed = List.filter (fun r -> r.passed) tests |> List.length in 194 let total = List.length tests in 195 Printf.printf "%s: %d/%d passed (%.1f%%)\n" cat passed total 196 (100.0 *. float_of_int passed /. float_of_int (max 1 total)) 197 ) by_category; 198 199 (* Breakdown by test type *) 200 let isvalid_results = List.filter (fun r -> r.file.expected = Valid) results in 201 let novalid_results = List.filter (fun r -> r.file.expected = Invalid) results in 202 let haswarn_results = List.filter (fun r -> r.file.expected = HasWarning) results in 203 204 let count_passed rs = List.filter (fun r -> r.passed) rs |> List.length in 205 206 Printf.printf "\n=== Results by Test Type ===\n"; 207 Printf.printf "isvalid (no errors expected): %d/%d passed (%.1f%%)\n" 208 (count_passed isvalid_results) (List.length isvalid_results) 209 (100.0 *. float_of_int (count_passed isvalid_results) /. float_of_int (max 1 (List.length isvalid_results))); 210 Printf.printf "novalid (errors expected): %d/%d passed (%.1f%%)\n" 211 (count_passed novalid_results) (List.length novalid_results) 212 (100.0 *. float_of_int (count_passed novalid_results) /. float_of_int (max 1 (List.length novalid_results))); 213 Printf.printf "haswarn (warnings expected): %d/%d passed (%.1f%%)\n" 214 (count_passed haswarn_results) (List.length haswarn_results) 215 (100.0 *. float_of_int (count_passed haswarn_results) /. float_of_int (max 1 (List.length haswarn_results))); 216 217 let total_passed = List.filter (fun r -> r.passed) results |> List.length in 218 let total = List.length results in 219 Printf.printf "\n=== Overall ===\n"; 220 Printf.printf "Total: %d/%d passed (%.1f%%)\n" total_passed total 221 (100.0 *. float_of_int total_passed /. float_of_int (max 1 total)) 222 223(** Generate HTML report *) 224let generate_html_report results output_path = 225 let by_category = group_by_category results in 226 227 let file_results = List.map (fun (category, tests) -> 228 let passed_count = List.filter (fun r -> r.passed) tests |> List.length in 229 let failed_count = List.length tests - passed_count in 230 let test_results = List.mapi (fun i r -> 231 let outcome_str = match r.file.expected with 232 | Valid -> "valid" 233 | Invalid -> "invalid" 234 | HasWarning -> "has-warning" 235 | Unknown -> "unknown" 236 in 237 let description = Printf.sprintf "[%s] %s" outcome_str r.file.relative_path in 238 let expected = match r.expected_message with 239 | Some m -> m 240 | None -> "(no expected message)" 241 in 242 let actual_str = 243 let errors = if r.actual_errors = [] then "" 244 else "Errors:\n" ^ String.concat "\n" r.actual_errors in 245 let warnings = if r.actual_warnings = [] then "" 246 else "Warnings:\n" ^ String.concat "\n" r.actual_warnings in 247 let infos = if r.actual_infos = [] then "" 248 else "Info:\n" ^ String.concat "\n" r.actual_infos in 249 if errors = "" && warnings = "" && infos = "" then "(no messages)" 250 else String.trim (errors ^ "\n" ^ warnings ^ "\n" ^ infos) 251 in 252 Report.{ 253 test_num = i + 1; 254 description; 255 input = r.file.relative_path; 256 expected; 257 actual = actual_str; 258 success = r.passed; 259 details = [("Status", r.details)]; 260 raw_test_data = None; 261 } 262 ) tests in 263 Report.{ 264 filename = category; 265 test_type = "HTML5 Validator"; 266 passed_count; 267 failed_count; 268 tests = test_results; 269 } 270 ) by_category in 271 272 let total_passed = List.filter (fun r -> r.passed) results |> List.length in 273 let total_failed = List.length results - total_passed in 274 275 let report : Report.report = { 276 title = "Nu HTML Validator Tests"; 277 test_type = "validator"; 278 description = "Tests from the Nu HTML Validator (W3C's official HTML checker). \ 279 Tests validate HTML5 conformance including element nesting, required attributes, \ 280 ARIA roles, obsolete elements, and more. Each test file is classified by suffix: \ 281 -isvalid.html (should produce no errors), -novalid.html (should produce errors), \ 282 -haswarn.html (should produce warnings)."; 283 files = file_results; 284 total_passed; 285 total_failed; 286 } in 287 Report.generate_report report output_path 288 289let () = 290 let tests_dir = if Array.length Sys.argv > 1 then Sys.argv.(1) else "validator/tests" in 291 let report_path = if Array.length Sys.argv > 2 then Sys.argv.(2) else "test_validator_report.html" in 292 293 Printf.printf "Loading messages.json...\n%!"; 294 let messages_path = Filename.concat tests_dir "messages.json" in 295 let messages = Validator_messages.load messages_path in 296 Printf.printf "Loaded %d expected messages\n%!" (Validator_messages.count messages); 297 298 Printf.printf "Discovering test files...\n%!"; 299 let tests = discover_tests tests_dir in 300 Printf.printf "Found %d test files\n%!" (List.length tests); 301 302 Printf.printf "Running tests...\n%!"; 303 let results = List.map (run_test messages) tests in 304 305 (* Print failing isvalid tests *) 306 let failing_isvalid = List.filter (fun r -> 307 r.file.expected = Valid && not r.passed 308 ) results in 309 if failing_isvalid <> [] then begin 310 Printf.printf "\n=== Failing isvalid tests ===\n"; 311 List.iter (fun r -> 312 Printf.printf "%s: %s\n" r.file.relative_path r.details 313 ) failing_isvalid 314 end; 315 316 (* Print failing haswarn tests *) 317 let failing_haswarn = List.filter (fun r -> 318 r.file.expected = HasWarning && not r.passed 319 ) results in 320 if failing_haswarn <> [] then begin 321 Printf.printf "\n=== Failing haswarn tests ===\n"; 322 List.iter (fun r -> 323 Printf.printf "%s\n" r.file.relative_path 324 ) failing_haswarn 325 end; 326 327 (* Print failing novalid tests *) 328 let failing_novalid = List.filter (fun r -> 329 r.file.expected = Invalid && not r.passed 330 ) results in 331 if failing_novalid <> [] then begin 332 Printf.printf "\n=== Failing novalid tests (first 50) ===\n"; 333 List.iteri (fun i r -> 334 if i < 50 then Printf.printf "%s\n" r.file.relative_path 335 ) failing_novalid 336 end; 337 338 print_summary results; 339 generate_html_report results report_path; 340 341 let failed_count = List.filter (fun r -> not r.passed) results |> List.length in 342 exit (if failed_count > 0 then 1 else 0)