OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Quick analysis: find failing test files and print their content *)
2
(* Root directory that is searched recursively for test files. It is a
   relative path, so it is resolved against the current working directory
   of the process. *)
let tests_dir = "validator/tests"
4
(** Verdict a test file expects from the checker, as encoded in its file
    name. [Unknown] means the name carries no recognised marker. *)
type expected_outcome = Valid | Invalid | HasWarning | Unknown

(** [parse_outcome filename] classifies [filename] by its suffix:
    - ["-isvalid.html"] / ["-isvalid.xhtml"] give [Valid]
    - ["-novalid.html"] / ["-novalid.xhtml"] give [Invalid]
    - ["-haswarn.html"] / ["-haswarn.xhtml"] give [HasWarning]

    Anything else gives [Unknown]. The match is case-sensitive, and the
    name must be strictly longer than the suffix, so a file called exactly
    ["-isvalid.html"] is [Unknown]. *)
let parse_outcome filename =
  (* Recognised suffixes, tried in order. They are mutually exclusive, so
     the order only matters for readability. *)
  let suffix_table =
    [ ("-isvalid.html", Valid);
      ("-novalid.html", Invalid);
      ("-haswarn.html", HasWarning);
      ("-isvalid.xhtml", Valid);
      ("-novalid.xhtml", Invalid);
      ("-haswarn.xhtml", HasWarning) ]
  in
  let name_len = String.length filename in
  let has_suffix suffix =
    (* The length is taken from the suffix itself instead of being
       hand-counted at every call site. *)
    let suffix_len = String.length suffix in
    (* Strict comparison: a non-empty stem before the marker is required. *)
    name_len > suffix_len
    && String.sub filename (name_len - suffix_len) suffix_len = suffix
  in
  match List.find_opt (fun (suffix, _) -> has_suffix suffix) suffix_table with
  | Some (_, outcome) -> outcome
  | None -> Unknown
17
(** [find_files dir] walks [dir] recursively and returns the paths of all
    files whose base name [parse_outcome] recognises (i.e. does not map to
    [Unknown]). Paths are built with [Filename.concat], so they are
    prefixed by [dir]. Entries appear in the order [Sys.readdir] yields
    them, with the contents of a subdirectory spliced in at that
    subdirectory's position. *)
let rec find_files dir =
  (* Paths contributed by a single directory entry. *)
  let paths_for name =
    let full = Filename.concat dir name in
    match Sys.is_directory full with
    | true -> find_files full
    | false ->
      (match parse_outcome (Filename.basename full) with
       | Unknown -> []
       | Valid | Invalid | HasWarning -> [ full ])
  in
  Sys.readdir dir |> Array.to_list |> List.concat_map paths_for
26
(** Entry point: report test files on which the checker disagrees with the
    outcome encoded in the file name.

    The first command-line argument selects the mode:
    - ["isvalid"]: report files expected to be [Valid] for which the
      checker produced at least one error or warning; the messages are
      printed before the file content.
    - anything else (default ["novalid"]): report files expected to be
      [Invalid] for which the checker produced no errors.

    Each report prints the path followed by the file content. Reporting
    stops after [max_reports] files. *)
let () =
  (* Upper bound on the number of files reported in one run. *)
  let max_reports = 60 in
  let mode = if Array.length Sys.argv > 1 then Sys.argv.(1) else "novalid" in
  let report_valid = (mode = "isvalid") in
  (* Only files with this expected outcome can ever be reported in the
     selected mode. *)
  let wanted = if report_valid then Valid else Invalid in
  (* Read a whole file. The channel is opened in binary mode because
     [in_channel_length] counts raw bytes and ignores text-mode
     end-of-line translation; [Fun.protect] closes the channel even when
     reading raises. *)
  let read_file path =
    let ic = open_in_bin path in
    Fun.protect
      ~finally:(fun () -> close_in_noerr ic)
      (fun () -> really_input_string ic (in_channel_length ic))
  in
  (* Print [title] followed by one line per message; prints nothing for an
     empty list. *)
  let print_messages title messages =
    if messages <> [] then begin
      Printf.printf "%s\n" title;
      List.iter
        (fun m -> Printf.printf " %s\n" m.Html5_checker.Message.message)
        messages
    end
  in
  let count = ref 0 in
  List.iter
    (fun path ->
      let outcome = parse_outcome (Filename.basename path) in
      (* Skip before any I/O: once the cap is reached, or when the expected
         outcome cannot match the mode, the file would never be printed. *)
      if !count < max_reports && outcome = wanted then begin
        let content = read_file path in
        let reader = Bytesrw.Bytes.Reader.of_string content in
        let result = Html5_checker.check ~collect_parse_errors:true reader in
        let errors = Html5_checker.errors result in
        let warnings = Html5_checker.warnings result in
        let should_print =
          if report_valid then errors <> [] || warnings <> []
          else errors = []
        in
        if should_print then begin
          Printf.printf "\n=== %s ===\n" path;
          if report_valid then begin
            print_messages "ERRORS:" errors;
            print_messages "WARNINGS:" warnings
          end;
          print_endline content;
          incr count
        end
      end)
    (find_files tests_dir)