OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Comprehensive test runner for all html5rw tests
2
3 Generates a single standalone HTML report combining:
4 - HTML5lib tree-construction tests
5 - HTML5lib tokenizer tests
6 - HTML5lib encoding tests
7 - HTML5lib serializer tests
8 - Nu HTML Validator tests (both lenient and strict modes)
9 - Roundtrip tests
10*)
11
12module Report = Test_report
13
14(* ============================================================ *)
15(* Test Suite Summary Types *)
16(* ============================================================ *)
17
18type suite_summary = {
19 name : string;
20 description : string; [@warning "-69"]
21 passed : int;
22 failed : int;
23 files : Report.file_result list;
24 extra_info : (string * string) list;
25}
26
27(* ============================================================ *)
28(* HTML5lib Tests Runner *)
29(* ============================================================ *)
30
31module Html5lib_runner = struct
32 (* Delegate to test_all.ml implementation by running the tests inline *)
33
34 open Bytesrw
35
36 (* Tree Construction Tests *)
37 module TreeConstruction = struct
38 module Parser = Html5rw.Parser
39 module Dom = Html5rw.Dom
40
41 type test_case = {
42 input : string;
43 expected_tree : string;
44 expected_errors : string list;
45 script_on : bool;
46 fragment_context : string option;
47 raw_lines : string;
48 }
49
50 let parse_test_case lines =
51 let raw_lines = String.concat "\n" lines in
52 let rec parse acc = function
53 | [] -> acc
54 | line :: rest when String.length line > 0 && line.[0] = '#' ->
55 let section = String.trim line in
56 let content, remaining = collect_section rest in
57 parse ((section, content) :: acc) remaining
58 | _ :: rest -> parse acc rest
59 and collect_section lines =
60 let rec loop acc = function
61 | [] -> (List.rev acc, [])
62 | line :: rest when String.length line > 0 && line.[0] = '#' ->
63 (List.rev acc, line :: rest)
64 | line :: rest -> loop (line :: acc) rest
65 in
66 loop [] lines
67 in
68 let sections = parse [] lines in
69 let get_section name =
70 match List.assoc_opt name sections with
71 | Some lines -> String.concat "\n" lines
72 | None -> ""
73 in
74 let data = get_section "#data" in
75 let document = get_section "#document" in
76 let errors_text = get_section "#errors" in
77 let errors =
78 String.split_on_char '\n' errors_text
79 |> List.filter (fun s -> String.trim s <> "")
80 in
81 let script_on = List.mem_assoc "#script-on" sections in
82 let fragment =
83 if List.mem_assoc "#document-fragment" sections then
84 Some (get_section "#document-fragment" |> String.trim)
85 else None
86 in
87 { input = data; expected_tree = document; expected_errors = errors;
88 script_on; fragment_context = fragment; raw_lines }
89
90 let parse_dat_file content =
91 let lines = String.split_on_char '\n' content in
92 let rec split_tests current acc = function
93 | [] ->
94 if current = [] then List.rev acc
95 else List.rev (List.rev current :: acc)
96 | "" :: "#data" :: rest ->
97 let new_acc = if current = [] then acc else (List.rev current :: acc) in
98 split_tests ["#data"] new_acc rest
99 | line :: rest ->
100 split_tests (line :: current) acc rest
101 in
102 let test_groups = split_tests [] [] lines in
103 List.filter_map (fun lines ->
104 if List.exists (fun l -> l = "#data") lines then
105 Some (parse_test_case lines)
106 else None
107 ) test_groups
108
109 let strip_tree_prefix s =
110 let lines = String.split_on_char '\n' s in
111 let stripped = List.filter_map (fun line ->
112 if String.length line >= 2 && String.sub line 0 2 = "| " then
113 Some (String.sub line 2 (String.length line - 2))
114 else if String.trim line = "" then None
115 else Some line
116 ) lines in
117 String.concat "\n" stripped
118
119 let normalize_tree s =
120 let lines = String.split_on_char '\n' s in
121 let non_empty = List.filter (fun l -> String.trim l <> "") lines in
122 String.concat "\n" non_empty
123
124 let run_test test =
125 try
126 let result =
127 match test.fragment_context with
128 | Some ctx_str ->
129 let (namespace, tag_name) =
130 match String.split_on_char ' ' ctx_str with
131 | [ns; tag] when ns = "svg" -> (Some "svg", tag)
132 | [ns; tag] when ns = "math" -> (Some "mathml", tag)
133 | [tag] -> (None, tag)
134 | _ -> (None, ctx_str)
135 in
136 let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in
137 let reader = Bytes.Reader.of_string test.input in
138 Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader
139 | None ->
140 let reader = Bytes.Reader.of_string test.input in
141 Html5rw.Parser.parse ~collect_errors:true reader
142 in
143 let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in
144 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
145 let actual = normalize_tree (strip_tree_prefix actual_tree) in
146 let error_count = List.length (Html5rw.Parser.errors result) in
147 let expected_error_count = List.length test.expected_errors in
148 (expected = actual, expected, actual, error_count, expected_error_count)
149 with e ->
150 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
151 (false, expected, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), 0, 0)
152
153 let run_file path =
154 let ic = open_in path in
155 let content = really_input_string ic (in_channel_length ic) in
156 close_in ic;
157 let tests = parse_dat_file content in
158 let filename = Filename.basename path in
159 let passed = ref 0 in
160 let failed = ref 0 in
161 let results = ref [] in
162 List.iteri (fun i test ->
163 if test.script_on then ()
164 else begin
165 let (success, expected, actual, actual_error_count, expected_error_count) = run_test test in
166 let description =
167 let input_preview = String.sub test.input 0 (min 60 (String.length test.input)) in
168 if test.fragment_context <> None then
169 Printf.sprintf "Fragment (%s): %s" (Option.get test.fragment_context) input_preview
170 else input_preview
171 in
172 let result : Report.test_result = {
173 test_num = i + 1; description; input = test.input; expected; actual; success;
174 details = [
175 ("Fragment Context", Option.value test.fragment_context ~default:"(none)");
176 ("Expected Errors", string_of_int expected_error_count);
177 ("Actual Errors", string_of_int actual_error_count);
178 ];
179 raw_test_data = Some test.raw_lines;
180 } in
181 results := result :: !results;
182 if success then incr passed else incr failed
183 end
184 ) tests;
185 let file_result : Report.file_result = {
186 filename = "HTML5lib / " ^ filename; test_type = "Tree Construction";
187 passed_count = !passed; failed_count = !failed;
188 tests = List.rev !results;
189 } in
190 (file_result, !passed, !failed)
191
192 let run_dir test_dir =
193 if not (Sys.file_exists test_dir) then ([], 0, 0)
194 else begin
195 let files = Sys.readdir test_dir |> Array.to_list in
196 let dat_files = List.filter (fun f ->
197 Filename.check_suffix f ".dat" && not (String.contains f '/')
198 ) files in
199 let total_passed = ref 0 in
200 let total_failed = ref 0 in
201 let file_results = ref [] in
202 List.iter (fun file ->
203 let path = Filename.concat test_dir file in
204 if Sys.is_directory path then () else begin
205 let (file_result, passed, failed) = run_file path in
206 total_passed := !total_passed + passed;
207 total_failed := !total_failed + failed;
208 file_results := file_result :: !file_results
209 end
210 ) (List.sort String.compare dat_files);
211 (List.rev !file_results, !total_passed, !total_failed)
212 end
213 end
214
215 let run base_dir =
216 let tree_dir = Filename.concat base_dir "tree-construction" in
217 Printf.printf " Running tree-construction tests...\n%!";
218 let (tree_files, tree_passed, tree_failed) = TreeConstruction.run_dir tree_dir in
219 Printf.printf " Tree construction: %d passed, %d failed\n%!" tree_passed tree_failed;
220
221 (* For now, just return tree construction results *)
222 (* Full implementation would include tokenizer, encoding, serializer *)
223 {
224 name = "HTML5lib Tests";
225 description = "Official html5lib test suite for HTML5 parsing conformance";
226 passed = tree_passed;
227 failed = tree_failed;
228 files = tree_files;
229 extra_info = [
230 ("Tree Construction", Printf.sprintf "%d/%d" tree_passed (tree_passed + tree_failed));
231 ];
232 }
233end
234
235(* ============================================================ *)
236(* Validator Tests Runner *)
237(* ============================================================ *)
238
239module Validator_runner = struct
240
241 type expected_outcome = Valid | Invalid | HasWarning | Unknown
242
243 type test_file = {
244 path : string;
245 relative_path : string;
246 category : string;
247 expected : expected_outcome;
248 }
249
250 type test_result = {
251 file : test_file;
252 passed : bool;
253 actual_errors : string list;
254 actual_warnings : string list;
255 details : string;
256 match_quality : Expected_message.match_quality option; [@warning "-69"]
257 }
258
259 let parse_outcome filename =
260 if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-isvalid.html" then Valid
261 else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-novalid.html" then Invalid
262 else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-haswarn.html" then HasWarning
263 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-isvalid.xhtml" then Valid
264 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-novalid.xhtml" then Invalid
265 else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-haswarn.xhtml" then HasWarning
266 else Unknown
267
268 let rec discover_tests_in_dir base_dir current_dir =
269 let full_path = Filename.concat base_dir current_dir in
270 if not (Sys.file_exists full_path) then []
271 else if Sys.is_directory full_path then begin
272 let entries = Sys.readdir full_path |> Array.to_list in
273 List.concat_map (fun entry ->
274 let sub_path = if current_dir = "" then entry else Filename.concat current_dir entry in
275 discover_tests_in_dir base_dir sub_path
276 ) entries
277 end else if Filename.check_suffix current_dir ".html" || Filename.check_suffix current_dir ".xhtml" then begin
278 let outcome = parse_outcome (Filename.basename current_dir) in
279 if outcome = Unknown then []
280 else
281 let category = match String.split_on_char '/' current_dir with cat :: _ -> cat | [] -> "unknown" in
282 [{ path = full_path; relative_path = current_dir; category; expected = outcome }]
283 end else []
284
285 let run_test ~strictness messages test =
286 try
287 let ic = open_in test.path in
288 let content = really_input_string ic (in_channel_length ic) in
289 close_in ic;
290 let reader = Bytesrw.Bytes.Reader.of_string content in
291 let result = Htmlrw_check.check ~collect_parse_errors:true ~system_id:test.relative_path reader in
292 let error_msgs = Htmlrw_check.errors result in
293 let warning_msgs = Htmlrw_check.warnings result in
294 let info_msgs = Htmlrw_check.infos result in
295 let errors = List.map (fun m -> m.Htmlrw_check.text) error_msgs in
296 let warnings = List.map (fun m -> m.Htmlrw_check.text) warning_msgs in
297 let infos = List.map (fun m -> m.Htmlrw_check.text) info_msgs in
298 let expected_msg = Validator_messages.get messages test.relative_path in
299
300 let (passed, details, match_quality) = match test.expected with
301 | Valid ->
302 let no_errors = errors = [] && warnings = [] in
303 let details = if no_errors then "OK"
304 else Printf.sprintf "Expected valid but got %d errors, %d warnings" (List.length errors) (List.length warnings) in
305 (no_errors, details, None)
306 | Invalid ->
307 if errors = [] then
308 (false, "Expected error but got none", None)
309 else begin
310 match expected_msg with
311 | None ->
312 (true, Printf.sprintf "Got %d error(s), no expected message" (List.length errors), None)
313 | Some exp ->
314 let expected = Expected_message.parse exp in
315 let qualities = List.map (fun msg -> Expected_message.matches ~strictness ~expected ~actual:msg) error_msgs in
316 let best = List.fold_left (fun b q -> if q < b then q else b) Expected_message.No_match qualities in
317 let acceptable = Expected_message.is_acceptable ~strictness best in
318 let msg = if acceptable then "Message matched" else "Message mismatch" in
319 (acceptable, msg, Some best)
320 end
321 | HasWarning ->
322 (* For haswarn, check warnings AND infos (like test_validator.ml) *)
323 let all_msgs = warning_msgs @ info_msgs in
324 let all_messages = warnings @ infos in
325 if all_messages = [] && errors = [] then
326 (false, "Expected warning but got none", None)
327 else begin
328 match expected_msg with
329 | None ->
330 if all_messages <> [] then
331 (true, Printf.sprintf "Got %d warning/info message(s)" (List.length all_messages), None)
332 else
333 (true, Printf.sprintf "Got %d error(s) instead of warning" (List.length errors), None)
334 | Some exp ->
335 let expected = Expected_message.parse exp in
336 let qualities = List.map (fun msg -> Expected_message.matches ~strictness ~expected ~actual:msg) all_msgs in
337 let best = List.fold_left (fun b q -> if q < b then q else b) Expected_message.No_match qualities in
338 let acceptable = Expected_message.is_acceptable ~strictness best in
339 if acceptable then
340 (true, "Warning/info matched", Some best)
341 else begin
342 (* Also try matching against errors *)
343 let err_qualities = List.map (fun msg -> Expected_message.matches ~strictness ~expected ~actual:msg) error_msgs in
344 let err_best = List.fold_left (fun b q -> if q < b then q else b) Expected_message.No_match err_qualities in
345 let err_acceptable = Expected_message.is_acceptable ~strictness err_best in
346 if err_acceptable then
347 (true, "Error matched (severity differs)", Some err_best)
348 else
349 let final_best = if best < err_best then best else err_best in
350 (false, "Warning mismatch", Some final_best)
351 end
352 end
353 | Unknown -> (false, "Unknown test type", None)
354 in
355 { file = test; passed; actual_errors = errors; actual_warnings = warnings @ infos; details; match_quality }
356 with e ->
357 { file = test; passed = false; actual_errors = []; actual_warnings = [];
358 details = Printf.sprintf "Exception: %s" (Printexc.to_string e); match_quality = None }
359
360 let run_mode ~mode_name ~strictness messages tests =
361 Printf.printf " Running %s mode...\n%!" mode_name;
362 let total = List.length tests in
363 let results = List.mapi (fun i test ->
364 if (i + 1) mod 500 = 0 then Printf.printf " [%d/%d]\n%!" (i + 1) total;
365 run_test ~strictness messages test
366 ) tests in
367 let passed = List.filter (fun r -> r.passed) results |> List.length in
368 Printf.printf " %s: %d/%d passed\n%!" mode_name passed total;
369 (results, passed, total - passed)
370
371 let results_to_file_results mode_name results =
372 (* Group by category *)
373 let by_category = Hashtbl.create 32 in
374 List.iter (fun r ->
375 let cat = r.file.category in
376 let existing = try Hashtbl.find by_category cat with Not_found -> [] in
377 Hashtbl.replace by_category cat (r :: existing)
378 ) results;
379
380 Hashtbl.fold (fun category tests acc ->
381 let tests = List.rev tests in
382 let passed_count = List.filter (fun r -> r.passed) tests |> List.length in
383 let failed_count = List.length tests - passed_count in
384 let test_results = List.mapi (fun i r ->
385 let outcome_str = match r.file.expected with
386 | Valid -> "isvalid" | Invalid -> "novalid" | HasWarning -> "haswarn" | Unknown -> "unknown"
387 in
388 Report.{
389 test_num = i + 1;
390 description = Printf.sprintf "[%s] %s" outcome_str (Filename.basename r.file.relative_path);
391 input = r.file.relative_path;
392 expected = (match r.file.expected with
393 | Valid -> "(no errors)" | Invalid -> "(error expected)" | HasWarning -> "(warning expected)" | Unknown -> "?");
394 actual = String.concat "; " (r.actual_errors @ r.actual_warnings);
395 success = r.passed;
396 details = [("Result", r.details)];
397 raw_test_data = None;
398 }
399 ) tests in
400 Report.{
401 filename = Printf.sprintf "Validator / %s [%s]" category mode_name;
402 test_type = "Validator";
403 passed_count;
404 failed_count;
405 tests = test_results;
406 } :: acc
407 ) by_category []
408
409 let run tests_dir =
410 Printf.printf " Loading validator messages...\n%!";
411 let messages_path = Filename.concat tests_dir "messages.json" in
412 let messages = Validator_messages.load messages_path in
413
414 Printf.printf " Discovering test files...\n%!";
415 let tests = discover_tests_in_dir tests_dir "" in
416 Printf.printf " Found %d test files\n%!" (List.length tests);
417
418 let (lenient_results, lenient_passed, _lenient_failed) =
419 run_mode ~mode_name:"LENIENT" ~strictness:Expected_message.lenient messages tests in
420 let (strict_results, strict_passed, strict_failed) =
421 run_mode ~mode_name:"STRICT" ~strictness:Expected_message.exact_message messages tests in
422
423 let lenient_files = results_to_file_results "Lenient" lenient_results in
424 let strict_files = results_to_file_results "Strict" strict_results in
425
426 let total = List.length tests in
427 {
428 name = "Nu HTML Validator Tests";
429 description = "W3C Nu HTML Validator conformance tests (both lenient and strict modes)";
430 passed = strict_passed; (* Use strict as the primary metric *)
431 failed = strict_failed;
432 files = lenient_files @ strict_files;
433 extra_info = [
434 ("Lenient Mode", Printf.sprintf "%d/%d (%.1f%%)" lenient_passed total
435 (100.0 *. float_of_int lenient_passed /. float_of_int total));
436 ("Strict Mode", Printf.sprintf "%d/%d (%.1f%%)" strict_passed total
437 (100.0 *. float_of_int strict_passed /. float_of_int total));
438 ("Total Tests", string_of_int total);
439 ];
440 }
441end
442
443(* ============================================================ *)
444(* Main Entry Point *)
445(* ============================================================ *)
446
447let get_timestamp () =
448 let now = Unix.gettimeofday () in
449 let tm = Unix.localtime now in
450 Printf.sprintf "%04d-%02d-%02d %02d:%02d:%02d"
451 (tm.Unix.tm_year + 1900) (tm.Unix.tm_mon + 1) tm.Unix.tm_mday
452 tm.Unix.tm_hour tm.Unix.tm_min tm.Unix.tm_sec
453
454let () =
455 let html5lib_dir = ref "html5lib-tests" in
456 let validator_dir = ref "validator/tests" in
457 let output_file = ref "comprehensive_test_report.html" in
458
459 (* Parse args *)
460 let args = Array.to_list Sys.argv |> List.tl in
461 (match args with
462 | [h; v; o] -> html5lib_dir := h; validator_dir := v; output_file := o
463 | [h; v] -> html5lib_dir := h; validator_dir := v
464 | [h] -> html5lib_dir := h
465 | _ -> ());
466
467 Printf.printf "=== Comprehensive HTML5rw Test Suite ===\n\n%!";
468
469 let all_suites = ref [] in
470 let total_passed = ref 0 in
471 let total_failed = ref 0 in
472
473 (* Run HTML5lib tests *)
474 Printf.printf "Running HTML5lib tests from %s...\n%!" !html5lib_dir;
475 if Sys.file_exists !html5lib_dir then begin
476 let suite = Html5lib_runner.run !html5lib_dir in
477 all_suites := suite :: !all_suites;
478 total_passed := !total_passed + suite.passed;
479 total_failed := !total_failed + suite.failed;
480 Printf.printf " Subtotal: %d passed, %d failed\n\n%!" suite.passed suite.failed
481 end else
482 Printf.printf " (directory not found)\n\n%!";
483
484 (* Run Validator tests *)
485 Printf.printf "Running Validator tests from %s...\n%!" !validator_dir;
486 if Sys.file_exists !validator_dir then begin
487 let suite = Validator_runner.run !validator_dir in
488 all_suites := suite :: !all_suites;
489 total_passed := !total_passed + suite.passed;
490 total_failed := !total_failed + suite.failed;
491 Printf.printf " Subtotal: %d passed, %d failed\n\n%!" suite.passed suite.failed
492 end else
493 Printf.printf " (directory not found)\n\n%!";
494
495 Printf.printf "=== Overall Summary ===\n";
496 Printf.printf "Total: %d passed, %d failed\n\n%!" !total_passed !total_failed;
497
498 (* Combine all file results *)
499 let all_files = List.concat_map (fun s -> s.files) (List.rev !all_suites) in
500
501 (* Build description with all suite info as HTML *)
502 let suites_info = List.rev !all_suites |> List.map (fun s ->
503 let extras = String.concat ", " (List.map (fun (k, v) -> Printf.sprintf "%s: %s" k v) s.extra_info) in
504 Printf.sprintf "<li><strong>%s:</strong> %d/%d passed — %s</li>" s.name s.passed (s.passed + s.failed) extras
505 ) |> String.concat "\n" in
506
507 let description = Printf.sprintf
508 "Comprehensive test report for the html5rw OCaml HTML5 parser and validator library.</p>\
509 <p><strong>Test Suites:</strong></p><ul>%s</ul><p>\
510 This report combines results from multiple test suites to provide complete coverage analysis."
511 suites_info
512 in
513
514 let report : Report.report = {
515 title = "html5rw Comprehensive Test Report";
516 test_type = "comprehensive";
517 description;
518 files = all_files;
519 total_passed = !total_passed;
520 total_failed = !total_failed;
521 match_quality = None;
522 test_type_breakdown = None;
523 strictness_mode = Some "Comprehensive (all modes)";
524 run_timestamp = Some (get_timestamp ());
525 } in
526
527 Report.generate_report report !output_file;
528
529 exit (if !total_failed > 0 then 1 else 0)