OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Test runner for Nu HTML Validator test suite
2
3 This validates HTML5 documents against the upstream Nu HTML Validator test suite.
4 Tests are classified by filename suffix:
5 - `-isvalid.html` : Should produce no errors or warnings
6 - `-novalid.html` : Should produce at least one error
7 - `-haswarn.html` : Should produce at least one warning
8*)
9
10module Report = Test_report
11
12type expected_outcome =
13 | Valid (** -isvalid.html: expect no errors *)
14 | Invalid (** -novalid.html: expect error matching messages.json *)
15 | HasWarning (** -haswarn.html: expect warning matching messages.json *)
16 | Unknown (** Unknown suffix *)
17
18type test_file = {
19 path : string; (** Full filesystem path *)
20 relative_path : string; (** Path relative to tests/, used as key in messages.json *)
21 category : string; (** html, html-aria, etc. *)
22 expected : expected_outcome;
23}
24
25type test_result = {
26 file : test_file;
27 passed : bool;
28 actual_errors : string list;
29 actual_warnings : string list;
30 actual_infos : string list;
31 expected_message : string option;
32 details : string;
33}
34
35(** Parse expected outcome from filename suffix *)
36let parse_outcome filename =
37 (* Check for .html suffix *)
38 if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-isvalid.html" then
39 Valid
40 else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-novalid.html" then
41 Invalid
42 else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-haswarn.html" then
43 HasWarning
44 (* Check for .xhtml suffix *)
45 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-isvalid.xhtml" then
46 Valid
47 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-novalid.xhtml" then
48 Invalid
49 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-haswarn.xhtml" then
50 HasWarning
51 else
52 Unknown
53
54(** Normalize Unicode curly quotes to ASCII *)
55let normalize_quotes s =
56 let buf = Buffer.create (String.length s) in
57 let i = ref 0 in
58 while !i < String.length s do
59 let c = s.[!i] in
60 (* Check for UTF-8 sequences for curly quotes *)
61 if !i + 2 < String.length s && c = '\xe2' then begin
62 let c1 = s.[!i + 1] in
63 let c2 = s.[!i + 2] in
64 if c1 = '\x80' && (c2 = '\x9c' || c2 = '\x9d') then begin
65 (* U+201C or U+201D -> ASCII quote *)
66 Buffer.add_char buf '"';
67 i := !i + 3
68 end else begin
69 Buffer.add_char buf c;
70 incr i
71 end
72 end else begin
73 Buffer.add_char buf c;
74 incr i
75 end
76 done;
77 Buffer.contents buf
78
79(** Check if actual message matches expected (flexible matching) *)
80let message_matches ~expected ~actual =
81 let expected_norm = normalize_quotes expected in
82 let actual_norm = normalize_quotes actual in
83 (* Exact match *)
84 actual_norm = expected_norm ||
85 (* Substring match *)
86 try
87 let _ = Str.search_forward (Str.regexp_string expected_norm) actual_norm 0 in
88 true
89 with Not_found ->
90 false
91
92(** Recursively find all HTML test files *)
93let rec discover_tests_in_dir base_dir current_dir =
94 let full_path = Filename.concat base_dir current_dir in
95 if not (Sys.file_exists full_path) then []
96 else if Sys.is_directory full_path then begin
97 let entries = Sys.readdir full_path |> Array.to_list in
98 List.concat_map (fun entry ->
99 let sub_path = if current_dir = "" then entry else Filename.concat current_dir entry in
100 discover_tests_in_dir base_dir sub_path
101 ) entries
102 end else if Filename.check_suffix current_dir ".html" || Filename.check_suffix current_dir ".xhtml" then begin
103 let outcome = parse_outcome (Filename.basename current_dir) in
104 if outcome = Unknown then []
105 else
106 let category =
107 match String.split_on_char '/' current_dir with
108 | cat :: _ -> cat
109 | [] -> "unknown"
110 in
111 [{ path = full_path; relative_path = current_dir; category; expected = outcome }]
112 end else
113 []
114
115let discover_tests tests_dir =
116 discover_tests_in_dir tests_dir ""
117
118(** Run a single test *)
119let run_test messages test =
120 try
121 let ic = open_in test.path in
122 let content = really_input_string ic (in_channel_length ic) in
123 close_in ic;
124
125 let reader = Bytesrw.Bytes.Reader.of_string content in
126 let result = Html5_checker.check ~collect_parse_errors:true ~system_id:test.relative_path reader in
127
128 let errors = Html5_checker.errors result |> List.map (fun m -> m.Html5_checker.Message.message) in
129 let warnings = Html5_checker.warnings result |> List.map (fun m -> m.Html5_checker.Message.message) in
130 let infos = Html5_checker.infos result |> List.map (fun m -> m.Html5_checker.Message.message) in
131 let expected_msg = Validator_messages.get messages test.relative_path in
132
133 let (passed, details) = match test.expected with
134 | Valid ->
135 (* isvalid tests fail on errors or warnings, but info messages are OK *)
136 if errors = [] && warnings = [] then
137 (true, if infos = [] then "OK: No messages" else Printf.sprintf "OK: No errors/warnings (%d info)" (List.length infos))
138 else
139 (false, Printf.sprintf "Expected valid but got %d errors, %d warnings"
140 (List.length errors) (List.length warnings))
141 | Invalid ->
142 if errors = [] then
143 (false, "Expected error but got none")
144 else begin
145 (* For novalid tests, require EXACT message match when expected message is provided *)
146 match expected_msg with
147 | None ->
148 (* No expected message - pass if any error detected *)
149 (true, Printf.sprintf "Got %d error(s), no expected message to match" (List.length errors))
150 | Some exp ->
151 if List.exists (fun actual -> message_matches ~expected:exp ~actual) errors then
152 (true, Printf.sprintf "Got %d error(s), message matched" (List.length errors))
153 else
154 (* FAIL if message doesn't match - we want exact matching *)
155 (false, Printf.sprintf "Message mismatch.\n Expected: %s\n Got: %s"
156 exp (String.concat "\n " errors))
157 end
158 | HasWarning ->
159 (* For haswarn, require message match against warnings or infos *)
160 let all_messages = warnings @ infos in
161 if all_messages = [] && errors = [] then
162 (false, "Expected warning but got none")
163 else begin
164 match expected_msg with
165 | None ->
166 if all_messages <> [] then
167 (true, Printf.sprintf "Got %d warning/info message(s)" (List.length all_messages))
168 else
169 (true, Printf.sprintf "Got %d error(s) instead of warning" (List.length errors))
170 | Some exp ->
171 if List.exists (fun actual -> message_matches ~expected:exp ~actual) all_messages then
172 (true, Printf.sprintf "Got %d warning/info message(s), matched" (List.length all_messages))
173 else if List.exists (fun actual -> message_matches ~expected:exp ~actual) errors then
174 (* Accept error if message matches (severity might differ) *)
175 (true, Printf.sprintf "Got error instead of warning, but message matched")
176 else
177 (false, Printf.sprintf "Message mismatch.\n Expected: %s\n Got warnings: %s\n Got errors: %s"
178 exp (String.concat "\n " (if all_messages = [] then ["(none)"] else all_messages))
179 (String.concat "\n " (if errors = [] then ["(none)"] else errors)))
180 end
181 | Unknown ->
182 (false, "Unknown test type")
183 in
184 { file = test; passed; actual_errors = errors; actual_warnings = warnings;
185 actual_infos = infos; expected_message = expected_msg; details }
186 with e ->
187 { file = test; passed = false; actual_errors = []; actual_warnings = [];
188 actual_infos = []; expected_message = None; details = Printf.sprintf "Exception: %s" (Printexc.to_string e) }
189
190(** Group tests by category *)
191let group_by_category tests =
192 let tbl = Hashtbl.create 16 in
193 List.iter (fun test ->
194 let cat = test.file.category in
195 let existing = try Hashtbl.find tbl cat with Not_found -> [] in
196 Hashtbl.replace tbl cat (test :: existing)
197 ) tests;
198 Hashtbl.fold (fun k v acc -> (k, List.rev v) :: acc) tbl []
199 |> List.sort (fun (a, _) (b, _) -> String.compare a b)
200
201(** Print summary to console *)
202let print_summary results =
203 let by_category = group_by_category results in
204 Printf.printf "\n=== Results by Category ===\n";
205 List.iter (fun (cat, tests) ->
206 let passed = List.filter (fun r -> r.passed) tests |> List.length in
207 let total = List.length tests in
208 Printf.printf "%s: %d/%d passed (%.1f%%)\n" cat passed total
209 (100.0 *. float_of_int passed /. float_of_int (max 1 total))
210 ) by_category;
211
212 (* Breakdown by test type *)
213 let isvalid_results = List.filter (fun r -> r.file.expected = Valid) results in
214 let novalid_results = List.filter (fun r -> r.file.expected = Invalid) results in
215 let haswarn_results = List.filter (fun r -> r.file.expected = HasWarning) results in
216
217 let count_passed rs = List.filter (fun r -> r.passed) rs |> List.length in
218
219 Printf.printf "\n=== Results by Test Type ===\n";
220 Printf.printf "isvalid (no errors expected): %d/%d passed (%.1f%%)\n"
221 (count_passed isvalid_results) (List.length isvalid_results)
222 (100.0 *. float_of_int (count_passed isvalid_results) /. float_of_int (max 1 (List.length isvalid_results)));
223 Printf.printf "novalid (errors expected): %d/%d passed (%.1f%%)\n"
224 (count_passed novalid_results) (List.length novalid_results)
225 (100.0 *. float_of_int (count_passed novalid_results) /. float_of_int (max 1 (List.length novalid_results)));
226 Printf.printf "haswarn (warnings expected): %d/%d passed (%.1f%%)\n"
227 (count_passed haswarn_results) (List.length haswarn_results)
228 (100.0 *. float_of_int (count_passed haswarn_results) /. float_of_int (max 1 (List.length haswarn_results)));
229
230 let total_passed = List.filter (fun r -> r.passed) results |> List.length in
231 let total = List.length results in
232 Printf.printf "\n=== Overall ===\n";
233 Printf.printf "Total: %d/%d passed (%.1f%%)\n" total_passed total
234 (100.0 *. float_of_int total_passed /. float_of_int (max 1 total))
235
236(** Generate HTML report *)
237let generate_html_report results output_path =
238 let by_category = group_by_category results in
239
240 let file_results = List.map (fun (category, tests) ->
241 let passed_count = List.filter (fun r -> r.passed) tests |> List.length in
242 let failed_count = List.length tests - passed_count in
243 let test_results = List.mapi (fun i r ->
244 let outcome_str = match r.file.expected with
245 | Valid -> "valid"
246 | Invalid -> "invalid"
247 | HasWarning -> "has-warning"
248 | Unknown -> "unknown"
249 in
250 let description = Printf.sprintf "[%s] %s" outcome_str r.file.relative_path in
251 let expected = match r.expected_message with
252 | Some m -> m
253 | None -> "(no expected message)"
254 in
255 let actual_str =
256 let errors = if r.actual_errors = [] then ""
257 else "Errors:\n" ^ String.concat "\n" r.actual_errors in
258 let warnings = if r.actual_warnings = [] then ""
259 else "Warnings:\n" ^ String.concat "\n" r.actual_warnings in
260 let infos = if r.actual_infos = [] then ""
261 else "Info:\n" ^ String.concat "\n" r.actual_infos in
262 if errors = "" && warnings = "" && infos = "" then "(no messages)"
263 else String.trim (errors ^ "\n" ^ warnings ^ "\n" ^ infos)
264 in
265 Report.{
266 test_num = i + 1;
267 description;
268 input = r.file.relative_path;
269 expected;
270 actual = actual_str;
271 success = r.passed;
272 details = [("Status", r.details)];
273 raw_test_data = None;
274 }
275 ) tests in
276 Report.{
277 filename = category;
278 test_type = "HTML5 Validator";
279 passed_count;
280 failed_count;
281 tests = test_results;
282 }
283 ) by_category in
284
285 let total_passed = List.filter (fun r -> r.passed) results |> List.length in
286 let total_failed = List.length results - total_passed in
287
288 let report : Report.report = {
289 title = "Nu HTML Validator Tests";
290 test_type = "validator";
291 description = "Tests from the Nu HTML Validator (W3C's official HTML checker). \
292 Tests validate HTML5 conformance including element nesting, required attributes, \
293 ARIA roles, obsolete elements, and more. Each test file is classified by suffix: \
294 -isvalid.html (should produce no errors), -novalid.html (should produce errors), \
295 -haswarn.html (should produce warnings).";
296 files = file_results;
297 total_passed;
298 total_failed;
299 } in
300 Report.generate_report report output_path
301
302let () =
303 let tests_dir = if Array.length Sys.argv > 1 then Sys.argv.(1) else "validator/tests" in
304 let report_path = if Array.length Sys.argv > 2 then Sys.argv.(2) else "test_validator_report.html" in
305
306 Printf.printf "Loading messages.json...\n%!";
307 let messages_path = Filename.concat tests_dir "messages.json" in
308 let messages = Validator_messages.load messages_path in
309 Printf.printf "Loaded %d expected messages\n%!" (Validator_messages.count messages);
310
311 Printf.printf "Discovering test files...\n%!";
312 let tests = discover_tests tests_dir in
313 Printf.printf "Found %d test files\n%!" (List.length tests);
314
315 Printf.printf "Running tests...\n%!";
316 let results = List.map (run_test messages) tests in
317
318 (* Print failing isvalid tests *)
319 let failing_isvalid = List.filter (fun r ->
320 r.file.expected = Valid && not r.passed
321 ) results in
322 if failing_isvalid <> [] then begin
323 Printf.printf "\n=== Failing isvalid tests ===\n";
324 List.iter (fun r ->
325 Printf.printf "%s: %s\n" r.file.relative_path r.details
326 ) failing_isvalid
327 end;
328
329 (* Print failing haswarn tests *)
330 let failing_haswarn = List.filter (fun r ->
331 r.file.expected = HasWarning && not r.passed
332 ) results in
333 if failing_haswarn <> [] then begin
334 Printf.printf "\n=== Failing haswarn tests ===\n";
335 List.iter (fun r ->
336 Printf.printf "%s\n" r.file.relative_path
337 ) failing_haswarn
338 end;
339
340 (* Print failing novalid tests *)
341 let failing_novalid = List.filter (fun r ->
342 r.file.expected = Invalid && not r.passed
343 ) results in
344 if failing_novalid <> [] then begin
345 Printf.printf "\n=== Failing novalid tests (first 50) ===\n";
346 List.iteri (fun i r ->
347 if i < 50 then Printf.printf "%s\n" r.file.relative_path
348 ) failing_novalid
349 end;
350
351 print_summary results;
352 generate_html_report results report_path;
353
354 let failed_count = List.filter (fun r -> not r.passed) results |> List.length in
355 exit (if failed_count > 0 then 1 else 0)