OCaml HTML5 parser/serialiser based on Python's JustHTML
(* Quick analysis: find failing test files and print their content.

   Usage: the first command-line argument selects the mode.
   - "isvalid": print files whose name marks them as valid but for which the
     checker reported at least one error or warning (the messages are
     printed too).
   - anything else (default "novalid"): print files whose name marks them as
     invalid but for which the checker reported no error.
   At most [max_reports] files are printed per run. *)

(** Root directory that is searched recursively for test files. *)
let tests_dir = "validator/tests"

(** Upper bound on the number of files printed per run. *)
let max_reports = 60

(** Outcome a test file is expected to produce, as encoded in its name. *)
type expected_outcome = Valid | Invalid | HasWarning | Unknown

(** File-name suffixes and the expected outcome each one stands for. *)
let outcome_suffixes =
  [ ("-isvalid.html", Valid);
    ("-novalid.html", Invalid);
    ("-haswarn.html", HasWarning);
    ("-isvalid.xhtml", Valid);
    ("-novalid.xhtml", Invalid);
    ("-haswarn.xhtml", HasWarning) ]

(** [parse_outcome filename] classifies [filename] by its suffix.

    The name must be strictly longer than the suffix, so a file called
    exactly ["-isvalid.html"] is [Unknown]. Returns [Unknown] when no known
    suffix matches. *)
let parse_outcome filename =
  let len = String.length filename in
  let has_suffix suffix =
    let n = String.length suffix in
    len > n && String.sub filename (len - n) n = suffix
  in
  match
    List.find_opt (fun (suffix, _) -> has_suffix suffix) outcome_suffixes
  with
  | Some (_, outcome) -> outcome
  | None -> Unknown

(** [find_files dir] returns the paths of all files below [dir] (searched
    recursively) whose name has a recognised outcome suffix.

    Entries are sorted by name because [Sys.readdir] returns them in no
    specific order; sorting makes the selection of reported files
    reproducible from run to run.

    @raise Sys_error if a directory cannot be read. *)
let rec find_files dir =
  let entries = Sys.readdir dir in
  Array.sort String.compare entries;
  entries
  |> Array.to_list
  |> List.concat_map (fun entry ->
         let path = Filename.concat dir entry in
         if Sys.is_directory path then find_files path
         else if parse_outcome (Filename.basename path) <> Unknown then
           [ path ]
         else [])

(** [read_file path] returns the full content of [path].

    The channel is opened in binary mode so that the length reported by
    [in_channel_length] matches the number of bytes actually read, and it is
    closed even when reading raises.

    @raise Sys_error if the file cannot be opened or read. *)
let read_file path =
  let ic = open_in_bin path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () -> really_input_string ic (in_channel_length ic))

let () =
  let mode = if Array.length Sys.argv > 1 then Sys.argv.(1) else "novalid" in
  let check_valid = mode = "isvalid" in
  (* Only files with this expected outcome can ever be reported in the
     selected mode, so the others are skipped without being read. *)
  let wanted = if check_valid then Valid else Invalid in
  let print_messages title messages =
    if messages <> [] then begin
      Printf.printf "%s:\n" title;
      List.iter
        (fun m -> Printf.printf " %s\n" m.Html5_checker.Message.message)
        messages
    end
  in
  (* [report printed files] walks [files] and stops as soon as
     [max_reports] files have been printed. *)
  let rec report printed files =
    match files with
    | path :: rest when printed < max_reports ->
      if parse_outcome (Filename.basename path) <> wanted then
        report printed rest
      else begin
        let content = read_file path in
        let reader = Bytesrw.Bytes.Reader.of_string content in
        let result = Html5_checker.check ~collect_parse_errors:true reader in
        let errors = Html5_checker.errors result in
        let warnings = Html5_checker.warnings result in
        (* A file "fails" when the checker disagrees with its name. *)
        let failing =
          if check_valid then errors <> [] || warnings <> []
          else errors = []
        in
        if failing then begin
          Printf.printf "\n=== %s ===\n" path;
          if check_valid then begin
            print_messages "ERRORS" errors;
            print_messages "WARNINGS" warnings
          end;
          print_endline content;
          report (printed + 1) rest
        end
        else report printed rest
      end
    | _ -> ()
  in
  report 0 (find_files tests_dir)