(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(* Test runner for html5lib-tests encoding tests *)
2
3module Encoding = Html5rw.Encoding
4module Report = Test_report
5
(** One encoding-detection case read from a [.dat] file. *)
type test_case = {
  input : string;
      (** Text of the [#data] section; these are the bytes handed to the
          decoder. *)
  expected_encoding : string;
      (** Trimmed text of the [#encoding] section. *)
  raw_lines : string;
      (** Original test data from the [.dat] file, lines joined with ['\n']. *)
}
11
(** [normalize_encoding_name s] strips surrounding whitespace from [s] and
    lowercases its ASCII letters, so that encoding labels can be compared
    without regard to case or padding. *)
let normalize_encoding_name s = s |> String.trim |> String.lowercase_ascii
15
(** [encoding_to_test_name encoding] maps a detected encoding to the
    lowercase label that results are compared against. *)
let encoding_to_test_name encoding =
  let open Html5rw.Encoding in
  match encoding with
  | Utf8 -> "utf-8"
  | Utf16le -> "utf-16le"
  | Utf16be -> "utf-16be"
  | Windows_1252 -> "windows-1252"
  | Iso_8859_2 -> "iso-8859-2"
  | Euc_jp -> "euc-jp"
24
(** [parse_test_case lines] builds a [test_case] from the lines of one test.

    A line whose first character is ['#'] opens a section named by the
    trimmed line; the lines that follow, up to the next such line, form its
    body. Lines preceding the first section header are dropped. When the same
    header occurs more than once, the body of its last occurrence is used.
    A missing [#data] or [#encoding] section yields the empty string. *)
let parse_test_case lines =
  let is_header line = String.length line > 0 && line.[0] = '#' in
  (* Move the currently open section, if any, onto the finished list. *)
  let close finished = function
    | None -> finished
    | Some (name, body_rev) -> (name, List.rev body_rev) :: finished
  in
  let step (finished, current) line =
    if is_header line then
      (close finished current, Some (String.trim line, []))
    else
      match current with
      | None -> (finished, None)
      | Some (name, body_rev) -> (finished, Some (name, line :: body_rev))
  in
  let finished, current = List.fold_left step ([], None) lines in
  (* Newest section first, so a lookup returns the last occurrence. *)
  let sections = close finished current in
  let section_text name =
    match List.assoc_opt name sections with
    | None -> ""
    | Some body -> String.concat "\n" body
  in
  {
    input = section_text "#data";
    expected_encoding = String.trim (section_text "#encoding");
    raw_lines = String.concat "\n" lines;
  }
56
(** [parse_dat_file content] splits the text of a [.dat] file into test cases.

    A new test starts wherever an empty line is immediately followed by a
    line that is exactly ["#data"]; that empty line is discarded. Groups that
    contain no ["#data"] line are dropped, and every remaining group is
    handed to [parse_test_case]. *)
let parse_dat_file content =
  (* Push the group being built (stored newest line first) onto [groups]. *)
  let flush group groups =
    match group with
    | [] -> groups
    | _ -> List.rev group :: groups
  in
  let rec go group groups = function
    | [] -> List.rev (flush group groups)
    | "" :: "#data" :: tail -> go [ "#data" ] (flush group groups) tail
    | line :: tail -> go (line :: group) groups tail
  in
  content
  |> String.split_on_char '\n'
  |> go [] []
  |> List.filter_map (fun group ->
         if List.mem "#data" group then Some (parse_test_case group) else None)
77
(** [run_test test] decodes [test.input] with [Html5rw.Encoding.decode] and
    checks the detected encoding against [test.expected_encoding].

    Returns [(success, detected, expected)]. On the normal path [expected] is
    the normalized expected label. If anything raises, [success] is [false],
    [detected] is ["EXCEPTION: ..."] carrying the printed exception, and
    [expected] is [test.expected_encoding] as stored. *)
let run_test test =
  (* Alternative spellings accepted for each detected label. *)
  let aliases_of = function
    | "windows-1252" -> [ "cp1252"; "iso-8859-1" ]
    | "iso-8859-2" -> [ "iso8859-2"; "latin2" ]
    | "utf-8" -> [ "utf8" ]
    | "euc-jp" -> [ "eucjp" ]
    | _ -> []
  in
  try
    let _, encoding = Html5rw.Encoding.decode (Bytes.of_string test.input) () in
    let detected = encoding_to_test_name encoding in
    let expected = normalize_encoding_name test.expected_encoding in
    let ok =
      (detected = expected) || List.mem expected (aliases_of detected)
    in
    (ok, detected, expected)
  with exn ->
    ( false,
      Printf.sprintf "EXCEPTION: %s" (Printexc.to_string exn),
      test.expected_encoding )
98
(* Read the whole of [path] as raw bytes; the channel is closed even when
   reading raises. Binary mode keeps the data byte-exact and keeps
   [in_channel_length] consistent with what is actually read. *)
let read_file_binary path =
  let ic = open_in_bin path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () -> really_input_string ic (in_channel_length ic))

(* True when [s] begins with a UTF-8, UTF-16 LE or UTF-16 BE byte order mark.
   Each mark is checked against its own length, so a 2-byte input consisting
   solely of a UTF-16 BOM is recognised. *)
let has_bom s =
  let starts_with prefix =
    let n = String.length prefix in
    String.length s >= n && String.sub s 0 n = prefix
  in
  List.exists starts_with [ "\xEF\xBB\xBF"; "\xFF\xFE"; "\xFE\xFF" ]

(** [run_file path] runs every encoding test found in the [.dat] file at
    [path].

    Tests whose expected encoding is blank are skipped; they still consume a
    test number, so [test_num] is the 1-based position of the test within the
    file.

    Returns [(file_result, passed, failed)], where [file_result] carries the
    per-test results in file order and the two counters cover the tests that
    were actually run.

    @raise Sys_error if the file cannot be opened or read. *)
let run_file path =
  let filename = Filename.basename path in
  let results =
    read_file_binary path
    |> parse_dat_file
    |> List.mapi (fun i test -> (i + 1, test))
    |> List.filter_map (fun (test_num, test) ->
           if String.trim test.expected_encoding = "" then None
           else begin
             let success, detected, expected = run_test test in
             let result : Report.test_result =
               {
                 test_num;
                 description = Printf.sprintf "Detect %s encoding" expected;
                 (* Escaped so that raw bytes are readable in the report. *)
                 input = String.escaped test.input;
                 expected;
                 actual = detected;
                 success;
                 details =
                   [
                     ("Input Length", string_of_int (String.length test.input));
                     ("Has BOM", string_of_bool (has_bom test.input));
                   ];
                 raw_test_data = Some test.raw_lines;
               }
             in
             Some result
           end)
  in
  let passed_count =
    List.fold_left
      (fun n r -> if r.Report.success then n + 1 else n)
      0 results
  in
  let failed_count = List.length results - passed_count in
  let file_result : Report.file_result =
    {
      filename;
      test_type = "Encoding Detection";
      passed_count;
      failed_count;
      tests = results;
    }
  in
  (file_result, passed_count, failed_count)
147
(* Entry point. Expects the directory holding the encoding [.dat] files as
   the first command-line argument, runs every such file in name order,
   prints per-file and total counts, writes [test_encoding_report.html], and
   exits with status 1 if any test failed, 0 otherwise. A missing argument
   prints a usage line and exits with status 2. *)
let () =
  if Array.length Sys.argv < 2 then begin
    Printf.eprintf "Usage: %s <encoding-tests-directory>\n" Sys.argv.(0);
    exit 2
  end;
  let test_dir = Sys.argv.(1) in
  let dat_files =
    Sys.readdir test_dir
    |> Array.to_list
    |> List.filter (fun f ->
           Filename.check_suffix f ".dat" && not (String.contains f '/'))
    |> List.sort String.compare
  in

  (* Results are accumulated newest first and reversed once at the end. *)
  let file_results_rev, total_passed, total_failed =
    List.fold_left
      (fun ((results, passed_acc, failed_acc) as acc) file ->
        let path = Filename.concat test_dir file in
        (* A directory whose name happens to end in ".dat" is not a test file. *)
        if Sys.is_directory path then acc
        else begin
          let file_result, passed, failed = run_file path in
          Printf.printf "%s: %d passed, %d failed\n" file passed failed;
          (file_result :: results, passed_acc + passed, failed_acc + failed)
        end)
      ([], 0, 0) dat_files
  in

  Printf.printf "\n=== Summary ===\n";
  Printf.printf "Total: %d passed, %d failed\n" total_passed total_failed;

  (* Generate HTML report *)
  let report : Report.report =
    {
      title = "HTML5 Encoding Detection Tests";
      test_type = "encoding";
      description = "These tests validate the character encoding detection algorithm as specified in the WHATWG \
                     Encoding Standard. The parser must determine the document's character encoding from byte order \
                     marks (BOM), meta charset declarations, or content sniffing. Tests cover UTF-8, UTF-16 \
                     (big/little endian), Windows-1252, ISO-8859-2, EUC-JP, and other encodings. The algorithm \
                     examines initial bytes for BOM signatures and scans the first 1024 bytes for meta elements \
                     declaring charset or http-equiv content-type.";
      files = List.rev file_results_rev;
      total_passed;
      total_failed;
      match_quality = None;
      test_type_breakdown = None;
      strictness_mode = None;
      run_timestamp = None;
    }
  in
  Report.generate_report report "test_encoding_report.html";

  exit (if total_failed > 0 then 1 else 0)
194 exit (if !total_failed > 0 then 1 else 0)