test/test_encoding.ml at main · anil.recoil.org/ocaml-html5rw

OCaml HTML5 parser/serialiser based on Python's JustHTML
ocaml-html5rw / test / test_encoding.ml
at main 7.0 kB view raw
  1(* Test runner for html5lib-tests encoding tests *)
  2
  3module Encoding = Html5rw.Encoding
  4module Report = Test_report
  5
(* One encoding-detection test case, as produced by parse_test_case. *)
type test_case = {
  input : string;  (* Body of the #data section, lines joined with newlines *)
  expected_encoding : string;  (* Trimmed body of the #encoding section; empty when the section is missing *)
  raw_lines : string;  (* Original test data from .dat file *)
}
 11
 12(* Normalize encoding name for comparison *)
 13let normalize_encoding_name s =
 14  String.lowercase_ascii (String.trim s)
 15
 16(* Convert our encoding type to canonical test name *)
 17let encoding_to_test_name = function
 18  | Html5rw.Encoding.Utf8 -> "utf-8"
 19  | Html5rw.Encoding.Utf16le -> "utf-16le"
 20  | Html5rw.Encoding.Utf16be -> "utf-16be"
 21  | Html5rw.Encoding.Windows_1252 -> "windows-1252"
 22  | Html5rw.Encoding.Iso_8859_2 -> "iso-8859-2"
 23  | Html5rw.Encoding.Euc_jp -> "euc-jp"
 24
 25(* Parse a single test case from lines *)
 26let parse_test_case lines =
 27  let raw_lines = String.concat "\n" lines in
 28  let rec parse acc = function
 29    | [] -> acc
 30    | line :: rest when String.length line > 0 && line.[0] = '#' ->
 31      let section = String.trim line in
 32      let content, remaining = collect_section rest in
 33      parse ((section, content) :: acc) remaining
 34    | _ :: rest -> parse acc rest
 35  and collect_section lines =
 36    let rec loop acc = function
 37      | [] -> (List.rev acc, [])
 38      | line :: rest when String.length line > 0 && line.[0] = '#' ->
 39        (List.rev acc, line :: rest)
 40      | line :: rest -> loop (line :: acc) rest
 41    in
 42    loop [] lines
 43  in
 44  let sections = parse [] lines in
 45
 46  let get_section name =
 47    match List.assoc_opt name sections with
 48    | Some lines -> String.concat "\n" lines
 49    | None -> ""
 50  in
 51
 52  let data = get_section "#data" in
 53  let encoding = get_section "#encoding" in
 54
 55  { input = data; expected_encoding = String.trim encoding; raw_lines }
 56
 57(* Parse a .dat file into test cases *)
 58let parse_dat_file content =
 59  let lines = String.split_on_char '\n' content in
 60  (* Split on empty lines followed by #data *)
 61  let rec split_tests current acc = function
 62    | [] ->
 63      if current = [] then List.rev acc
 64      else List.rev (List.rev current :: acc)
 65    | "" :: "#data" :: rest ->
 66      let new_acc = if current = [] then acc else (List.rev current :: acc) in
 67      split_tests ["#data"] new_acc rest
 68    | line :: rest ->
 69      split_tests (line :: current) acc rest
 70  in
 71  let test_groups = split_tests [] [] lines in
 72  List.filter_map (fun lines ->
 73    if List.exists (fun l -> l = "#data") lines then
 74      Some (parse_test_case lines)
 75    else None
 76  ) test_groups
 77
 78(* Run a single encoding test *)
 79let run_test test =
 80  try
 81    (* Detect encoding from the input bytes *)
 82    let (_, detected_encoding) = Html5rw.Encoding.decode (Bytes.of_string test.input) () in
 83    let detected_name = encoding_to_test_name detected_encoding in
 84    let expected_name = normalize_encoding_name test.expected_encoding in
 85
 86    (* Compare - allow some flexibility in naming *)
 87    let match_encoding det exp =
 88      det = exp ||
 89      (det = "windows-1252" && (exp = "windows-1252" || exp = "cp1252" || exp = "iso-8859-1")) ||
 90      (det = "iso-8859-2" && (exp = "iso-8859-2" || exp = "iso8859-2" || exp = "latin2")) ||
 91      (det = "utf-8" && (exp = "utf-8" || exp = "utf8")) ||
 92      (det = "euc-jp" && (exp = "euc-jp" || exp = "eucjp"))
 93    in
 94
 95    (match_encoding detected_name expected_name, detected_name, expected_name)
 96  with e ->
 97    (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected_encoding)
 98
 99(* Run all tests in a file *)
100let run_file path =
101  let ic = open_in path in
102  let content = really_input_string ic (in_channel_length ic) in
103  close_in ic;
104
105  let tests = parse_dat_file content in
106  let filename = Filename.basename path in
107
108  let passed = ref 0 in
109  let failed = ref 0 in
110  let results = ref [] in
111
112  List.iteri (fun i test ->
113    if String.trim test.expected_encoding = "" then
114      (* Skip tests without expected encoding *)
115      ()
116    else begin
117      let (success, detected, expected) = run_test test in
118      let result : Report.test_result = {
119        test_num = i + 1;
120        description = Printf.sprintf "Detect %s encoding" expected;
121        input = String.escaped test.input;  (* Show escaped version of full input *)
122        expected;
123        actual = detected;
124        success;
125        details = [
126          ("Input Length", string_of_int (String.length test.input));
127          ("Has BOM", string_of_bool (String.length test.input >= 3 &&
128            (String.sub test.input 0 3 = "\xEF\xBB\xBF" ||  (* UTF-8 BOM *)
129             String.sub test.input 0 2 = "\xFF\xFE" ||      (* UTF-16 LE BOM *)
130             String.sub test.input 0 2 = "\xFE\xFF")));     (* UTF-16 BE BOM *)
131        ];
132        raw_test_data = Some test.raw_lines;
133      } in
134      results := result :: !results;
135      if success then incr passed else incr failed
136    end
137  ) tests;
138
139  let file_result : Report.file_result = {
140    filename;
141    test_type = "Encoding Detection";
142    passed_count = !passed;
143    failed_count = !failed;
144    tests = List.rev !results;
145  } in
146  (file_result, !passed, !failed)
147
(* Entry point: run every .dat file in the directory given as the first
   command-line argument, print per-file and overall totals, write the HTML
   report to test_encoding_report.html, and exit with status 1 if any test
   failed (0 otherwise). A missing argument prints a usage line and exits
   with status 2. *)
let () =
  (* Fail with a readable message instead of an index-out-of-bounds
     exception when the test directory argument is missing. *)
  if Array.length Sys.argv < 2 then begin
    Printf.eprintf "Usage: %s <test-directory>\n" Sys.executable_name;
    exit 2
  end;
  let test_dir = Sys.argv.(1) in
  let files = Sys.readdir test_dir |> Array.to_list in
  let dat_files = List.filter (fun f ->
    Filename.check_suffix f ".dat" &&
    not (String.contains f '/')
  ) files in

  let total_passed = ref 0 in
  let total_failed = ref 0 in
  let file_results = ref [] in

  (* Files are processed in sorted order so the output is stable. *)
  List.iter (fun file ->
    let path = Filename.concat test_dir file in
    (* A directory whose name happens to end in .dat is not a test file. *)
    if Sys.is_directory path then () else begin
      let (file_result, passed, failed) = run_file path in
      total_passed := !total_passed + passed;
      total_failed := !total_failed + failed;
      file_results := file_result :: !file_results;
      Printf.printf "%s: %d passed, %d failed\n" file passed failed
    end
  ) (List.sort String.compare dat_files);

  Printf.printf "\n=== Summary ===\n";
  Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;

  (* Generate HTML report *)
  let report : Report.report = {
    title = "HTML5 Encoding Detection Tests";
    test_type = "encoding";
    description = "These tests validate the character encoding detection algorithm as specified in the WHATWG \
                   Encoding Standard. The parser must determine the document's character encoding from byte order \
                   marks (BOM), meta charset declarations, or content sniffing. Tests cover UTF-8, UTF-16 \
                   (big/little endian), Windows-1252, ISO-8859-2, EUC-JP, and other encodings. The algorithm \
                   examines initial bytes for BOM signatures and scans the first 1024 bytes for meta elements \
                   declaring charset or http-equiv content-type.";
    (* Per-file results were accumulated in reverse; restore run order. *)
    files = List.rev !file_results;
    total_passed = !total_passed;
    total_failed = !total_failed;
    match_quality = None;
    test_type_breakdown = None;
    strictness_mode = None;
    run_timestamp = None;
  } in
  Report.generate_report report "test_encoding_report.html";

  exit (if !total_failed > 0 then 1 else 0)