OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 7.0 kB view raw
(* Test runner for html5lib-tests encoding tests.

   Usage: <program> <test-directory>

   Every "*.dat" file directly inside <test-directory> is parsed into test
   cases, each case is run through [Encoding.decode], a per-file and overall
   summary is printed on stdout, and an HTML report is written to
   "test_encoding_report.html". The exit status is 1 when at least one test
   failed, 2 when the directory argument is missing, and 0 otherwise. *)

module Encoding = Html5rw.Encoding
module Report = Test_report

(* A single test case read from a .dat file. *)
type test_case = {
  input : string;              (* Contents of the #data section *)
  expected_encoding : string;  (* Trimmed contents of the #encoding section *)
  raw_lines : string;          (* Original test data from .dat file *)
}

(* Normalize encoding name for comparison: strip surrounding whitespace and
   lowercase ASCII letters. *)
let normalize_encoding_name s =
  String.lowercase_ascii (String.trim s)

(* Convert our encoding type to canonical test name *)
let encoding_to_test_name = function
  | Encoding.Utf8 -> "utf-8"
  | Encoding.Utf16le -> "utf-16le"
  | Encoding.Utf16be -> "utf-16be"
  | Encoding.Windows_1252 -> "windows-1252"
  | Encoding.Iso_8859_2 -> "iso-8859-2"
  | Encoding.Euc_jp -> "euc-jp"

(* A line opens a new section when its first character is '#'. *)
let is_section_header line =
  String.length line > 0 && line.[0] = '#'

(* Parse a single test case from lines.

   [lines] is split into sections: a header line starting with '#' owns every
   following line up to the next header. Lines that precede the first header
   are ignored. The "#data" section becomes [input] (lines re-joined with
   '\n') and the trimmed "#encoding" section becomes [expected_encoding]; a
   missing section yields the empty string. *)
let parse_test_case lines =
  let raw_lines = String.concat "\n" lines in
  (* Gather the lines belonging to the current section; also return the
     remaining lines, starting at the next header if there is one. *)
  let rec collect_section acc = function
    | [] -> (List.rev acc, [])
    | line :: _ as remaining when is_section_header line ->
        (List.rev acc, remaining)
    | line :: rest -> collect_section (line :: acc) rest
  in
  let rec parse acc = function
    | [] -> acc
    | line :: rest when is_section_header line ->
        let content, remaining = collect_section [] rest in
        parse ((String.trim line, content) :: acc) remaining
    | _ :: rest -> parse acc rest
  in
  (* [sections] is built newest-first, so when a header is repeated the
     lookup below returns its last occurrence. *)
  let sections = parse [] lines in
  let get_section name =
    match List.assoc_opt name sections with
    | Some section_lines -> String.concat "\n" section_lines
    | None -> ""
  in
  {
    input = get_section "#data";
    expected_encoding = String.trim (get_section "#encoding");
    raw_lines;
  }

(* Parse a .dat file into test cases.

   A new test starts at every "#data" line that directly follows an empty
   line; that separating empty line is dropped. Groups that contain no
   "#data" line at all are discarded. *)
let parse_dat_file content =
  let lines = String.split_on_char '\n' content in
  (* [current] holds the lines of the group being built, in reverse order;
     [acc] holds the finished groups, also in reverse order. *)
  let rec split_tests current acc = function
    | [] ->
        if current = [] then List.rev acc
        else List.rev (List.rev current :: acc)
    | "" :: "#data" :: rest ->
        let new_acc = if current = [] then acc else List.rev current :: acc in
        split_tests [ "#data" ] new_acc rest
    | line :: rest -> split_tests (line :: current) acc rest
  in
  split_tests [] [] lines
  |> List.filter_map (fun group ->
         if List.mem "#data" group then Some (parse_test_case group) else None)

(* Compare a detected encoding name with an expected one, allowing some
   flexibility in naming. Both arguments are expected to be lowercase. *)
let encodings_match det exp =
  det = exp
  || (det = "windows-1252" && (exp = "cp1252" || exp = "iso-8859-1"))
  || (det = "iso-8859-2" && (exp = "iso8859-2" || exp = "latin2"))
  || (det = "utf-8" && exp = "utf8")
  || (det = "euc-jp" && exp = "eucjp")

(* Run a single encoding test.

   Returns [(success, detected, expected)] where [expected] is the normalized
   expected encoding name. Any exception raised during detection is turned
   into a failed result whose [detected] text is "EXCEPTION: ...", so that one
   crashing input does not abort the whole run. *)
let run_test test =
  let expected_name = normalize_encoding_name test.expected_encoding in
  try
    (* Detect encoding from the input bytes; the first component of the
       result is not needed here. *)
    let (_, detected_encoding) =
      Encoding.decode (Bytes.of_string test.input) ()
    in
    let detected_name = encoding_to_test_name detected_encoding in
    (encodings_match detected_name expected_name, detected_name, expected_name)
  with e ->
    (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), expected_name)

(* [has_prefix ~prefix s] is true when [s] starts with [prefix]. It is safe
   when [s] is shorter than [prefix]. *)
let has_prefix ~prefix s =
  let prefix_len = String.length prefix in
  String.length s >= prefix_len && String.sub s 0 prefix_len = prefix

(* True when [s] starts with a UTF-8, UTF-16 LE or UTF-16 BE byte order mark.
   Each mark is checked against its own length, so a 2-byte input that is
   exactly a UTF-16 BOM is recognized. *)
let has_bom s =
  has_prefix ~prefix:"\xEF\xBB\xBF" s    (* UTF-8 BOM *)
  || has_prefix ~prefix:"\xFF\xFE" s     (* UTF-16 LE BOM *)
  || has_prefix ~prefix:"\xFE\xFF" s     (* UTF-16 BE BOM *)

(* Read the whole file at [path] as raw bytes.

   The channel is opened in binary mode because the test data is byte
   oriented and must not go through text-mode translation. The channel is
   closed even if reading raises. *)
let read_file path =
  let ic = open_in_bin path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () -> really_input_string ic (in_channel_length ic))

(* Run all tests in a file.

   Returns [(file_result, passed, failed)]. Test cases without an expected
   encoding are skipped and counted in neither total; [test_num] is still the
   1-based position of the case within the file, skipped cases included. *)
let run_file path =
  let tests = parse_dat_file (read_file path) in
  let filename = Filename.basename path in

  let passed = ref 0 in
  let failed = ref 0 in
  let results = ref [] in

  List.iteri (fun i test ->
    if String.trim test.expected_encoding = "" then
      (* Skip tests without expected encoding *)
      ()
    else begin
      let (success, detected, expected) = run_test test in
      let result : Report.test_result = {
        test_num = i + 1;
        description = Printf.sprintf "Detect %s encoding" expected;
        input = String.escaped test.input;  (* Show escaped version of full input *)
        expected;
        actual = detected;
        success;
        details = [
          ("Input Length", string_of_int (String.length test.input));
          ("Has BOM", string_of_bool (has_bom test.input));
        ];
        raw_test_data = Some test.raw_lines;
      } in
      results := result :: !results;
      if success then incr passed else incr failed
    end
  ) tests;

  let file_result : Report.file_result = {
    filename;
    test_type = "Encoding Detection";
    passed_count = !passed;
    failed_count = !failed;
    tests = List.rev !results;
  } in
  (file_result, !passed, !failed)

let () =
  (* Fail with a usage message instead of an out-of-bounds exception when the
     test directory argument is missing. *)
  if Array.length Sys.argv < 2 then begin
    Printf.eprintf "Usage: %s <test-directory>\n" Sys.argv.(0);
    exit 2
  end;
  let test_dir = Sys.argv.(1) in
  let files = Sys.readdir test_dir |> Array.to_list in
  let dat_files = List.filter (fun f ->
    Filename.check_suffix f ".dat" &&
    not (String.contains f '/')
  ) files in

  let total_passed = ref 0 in
  let total_failed = ref 0 in
  let file_results = ref [] in

  (* Files are processed in sorted order so the output is deterministic. *)
  List.iter (fun file ->
    let path = Filename.concat test_dir file in
    if Sys.is_directory path then () else begin
      let (file_result, passed, failed) = run_file path in
      total_passed := !total_passed + passed;
      total_failed := !total_failed + failed;
      file_results := file_result :: !file_results;
      Printf.printf "%s: %d passed, %d failed\n" file passed failed
    end
  ) (List.sort String.compare dat_files);

  Printf.printf "\n=== Summary ===\n";
  Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;

  (* Generate HTML report *)
  let report : Report.report = {
    title = "HTML5 Encoding Detection Tests";
    test_type = "encoding";
    description = "These tests validate the character encoding detection algorithm as specified in the WHATWG \
                   Encoding Standard. The parser must determine the document's character encoding from byte order \
                   marks (BOM), meta charset declarations, or content sniffing. Tests cover UTF-8, UTF-16 \
                   (big/little endian), Windows-1252, ISO-8859-2, EUC-JP, and other encodings. The algorithm \
                   examines initial bytes for BOM signatures and scans the first 1024 bytes for meta elements \
                   declaring charset or http-equiv content-type.";
    files = List.rev !file_results;
    total_passed = !total_passed;
    total_failed = !total_failed;
    match_quality = None;
    test_type_breakdown = None;
    strictness_mode = None;
    run_timestamp = None;
  } in
  Report.generate_report report "test_encoding_report.html";

  exit (if !total_failed > 0 then 1 else 0)