(* Test runner for html5lib-tests encoding tests *)
module Encoding = Html5rw.Encoding
module Report = Test_report
(* A single encoding test as parsed from an html5lib-tests .dat file. *)
type test_case = {
(* Content of the "#data" section; empty when the section is absent. *)
input : string;
(* Trimmed content of the "#encoding" section; empty when absent. *)
expected_encoding : string;
raw_lines : string; (* Original test data from .dat file *)
}
(* Canonicalise an encoding label so two labels can be compared directly:
   surrounding whitespace is stripped and ASCII letters are lowercased. *)
let normalize_encoding_name s =
  s |> String.trim |> String.lowercase_ascii
(* Map a detected encoding value to the lowercase label that is compared
   against the expected encoding of a test. *)
let encoding_to_test_name enc =
  let open Html5rw.Encoding in
  match enc with
  | Utf8 -> "utf-8"
  | Utf16le -> "utf-16le"
  | Utf16be -> "utf-16be"
  | Windows_1252 -> "windows-1252"
  | Iso_8859_2 -> "iso-8859-2"
  | Euc_jp -> "euc-jp"
(* Build a test case from the lines of one test.

   A line whose first character is '#' opens a section named after the
   trimmed line; the lines that follow, up to the next such line, form its
   body. Lines that appear before the first section header are ignored.
   When a section name occurs more than once, the last occurrence wins. *)
let parse_test_case lines =
  let is_header line = String.length line > 0 && line.[0] = '#' in
  (* Move the section being built, if any, onto the list of finished ones. *)
  let close current finished =
    match current with
    | None -> finished
    | Some (name, rev_body) -> (name, List.rev rev_body) :: finished
  in
  let step (finished, current) line =
    if is_header line then
      (close current finished, Some (String.trim line, []))
    else
      (match current with
       | None -> (finished, None)
       | Some (name, rev_body) -> (finished, Some (name, line :: rev_body)))
  in
  let finished, current = List.fold_left step ([], None) lines in
  (* Most recently seen section first, so lookups find the last occurrence. *)
  let sections = close current finished in
  let section_text name =
    List.assoc_opt name sections
    |> Option.map (String.concat "\n")
    |> Option.value ~default:""
  in
  {
    input = section_text "#data";
    expected_encoding = String.trim (section_text "#encoding");
    raw_lines = String.concat "\n" lines;
  }
(* Split the text of a .dat file into test cases.

   A new test begins wherever an empty line is immediately followed by a
   line that is exactly "#data"; the empty separator line is dropped.
   Groups that contain no "#data" line are discarded. *)
let parse_dat_file content =
  (* Push the pending (reversed) group onto [groups] unless it is empty. *)
  let flush pending groups =
    match pending with
    | [] -> groups
    | _ -> List.rev pending :: groups
  in
  let rec group pending groups remaining =
    match remaining with
    | "" :: "#data" :: tail -> group [ "#data" ] (flush pending groups) tail
    | line :: tail -> group (line :: pending) groups tail
    | [] -> List.rev (flush pending groups)
  in
  content
  |> String.split_on_char '\n'
  |> group [] []
  |> List.filter_map (fun group_lines ->
         if List.mem "#data" group_lines then
           Some (parse_test_case group_lines)
         else None)
(* Run one encoding test.

   Returns [(success, detected, expected)]. On the normal path [detected] is
   the label of the encoding reported by the decoder and [expected] is the
   normalized expected label. If anything raises, the test is reported as
   failed, [detected] carries the exception text and [expected] is the
   expected encoding exactly as stored in the test case. *)
let run_test test =
  (* Expected labels accepted as equivalent for a given detected label. *)
  let accepted_labels = function
    | "windows-1252" -> [ "windows-1252"; "cp1252"; "iso-8859-1" ]
    | "iso-8859-2" -> [ "iso-8859-2"; "iso8859-2"; "latin2" ]
    | "utf-8" -> [ "utf-8"; "utf8" ]
    | "euc-jp" -> [ "euc-jp"; "eucjp" ]
    | _ -> []
  in
  try
    (* Only the detected encoding is of interest; the decoded text is dropped. *)
    let detected =
      Html5rw.Encoding.decode (Bytes.of_string test.input) ()
      |> snd
      |> encoding_to_test_name
    in
    let expected = normalize_encoding_name test.expected_encoding in
    let ok =
      (detected = expected) || List.mem expected (accepted_labels detected)
    in
    (ok, detected, expected)
  with e ->
    ( false,
      Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e),
      test.expected_encoding )
(* Run every encoding test found in the .dat file at [path].

   Returns [(file_result, passed, failed)] where [file_result] holds one
   report entry per executed test, in file order. Tests whose expected
   encoding is empty are skipped and counted as neither passed nor failed;
   they still consume a test number, so numbering follows the position of
   the test in the file.

   Raises [Sys_error] if the file cannot be opened, and lets any exception
   raised while reading propagate after the channel has been closed. *)
let run_file path =
  (* Binary mode: the bytes are the test input and must not go through any
     newline translation, and [in_channel_length] is only a reliable read
     size for a binary channel. [Fun.protect] closes the channel even when
     the read raises. *)
  let content =
    let ic = open_in_bin path in
    Fun.protect
      ~finally:(fun () -> close_in_noerr ic)
      (fun () -> really_input_string ic (in_channel_length ic))
  in
  (* True when [input] starts with a UTF-8, UTF-16 LE or UTF-16 BE byte
     order mark. Each mark is checked against its own length, so a 2-byte
     input made only of a UTF-16 mark is recognised. *)
  let has_bom input =
    let starts_with bom =
      let n = String.length bom in
      String.length input >= n && String.sub input 0 n = bom
    in
    List.exists starts_with [ "\xEF\xBB\xBF"; "\xFF\xFE"; "\xFE\xFF" ]
  in
  let tests = parse_dat_file content in
  let filename = Filename.basename path in
  let passed = ref 0 in
  let failed = ref 0 in
  let results = ref [] in
  List.iteri
    (fun i test ->
      (* A test without an expected encoding cannot be judged: skip it. *)
      if String.trim test.expected_encoding <> "" then begin
        let success, detected, expected = run_test test in
        let result : Report.test_result = {
          test_num = i + 1;
          description = Printf.sprintf "Detect %s encoding" expected;
          (* Escaped so that raw bytes are printable in the report. *)
          input = String.escaped test.input;
          expected;
          actual = detected;
          success;
          details = [
            ("Input Length", string_of_int (String.length test.input));
            ("Has BOM", string_of_bool (has_bom test.input));
          ];
          raw_test_data = Some test.raw_lines;
        } in
        results := result :: !results;
        if success then incr passed else incr failed
      end)
    tests;
  let file_result : Report.file_result = {
    filename;
    test_type = "Encoding Detection";
    passed_count = !passed;
    failed_count = !failed;
    (* Results were accumulated newest first; restore file order. *)
    tests = List.rev !results;
  } in
  (file_result, !passed, !failed)
(* Entry point.

   Usage: <program> <directory containing .dat files>

   Runs every .dat file of the directory in alphabetical order, prints a
   per-file and a global summary on standard output, writes the HTML report
   to test_encoding_report.html, and exits with status 1 if any test failed,
   0 otherwise. A missing directory argument prints a usage message on
   standard error and exits with status 2. *)
let () =
  (* Guard the argument access: without it a missing argument surfaces as
     an uncaught index-out-of-bounds exception. *)
  if Array.length Sys.argv < 2 then begin
    Printf.eprintf "Usage: %s <encoding-tests-directory>\n"
      Sys.executable_name;
    exit 2
  end;
  let test_dir = Sys.argv.(1) in
  let dat_files =
    Sys.readdir test_dir
    |> Array.to_list
    |> List.filter (fun name ->
           Filename.check_suffix name ".dat"
           && not (String.contains name '/'))
    |> List.sort String.compare
  in
  (* Accumulate the totals and the per-file results (newest first) while
     printing one summary line per file. *)
  let total_passed, total_failed, rev_file_results =
    List.fold_left
      (fun (passed_so_far, failed_so_far, acc) file ->
        let path = Filename.concat test_dir file in
        if Sys.is_directory path then (passed_so_far, failed_so_far, acc)
        else begin
          let file_result, passed, failed = run_file path in
          Printf.printf "%s: %d passed, %d failed\n" file passed failed;
          (passed_so_far + passed, failed_so_far + failed, file_result :: acc)
        end)
      (0, 0, []) dat_files
  in
  Printf.printf "\n=== Summary ===\n";
  Printf.printf "Total: %d passed, %d failed\n" total_passed total_failed;
  (* Generate HTML report *)
  let report : Report.report = {
    title = "HTML5 Encoding Detection Tests";
    test_type = "encoding";
    description = "These tests validate the character encoding detection algorithm as specified in the WHATWG \
                   Encoding Standard. The parser must determine the document's character encoding from byte order \
                   marks (BOM), meta charset declarations, or content sniffing. Tests cover UTF-8, UTF-16 \
                   (big/little endian), Windows-1252, ISO-8859-2, EUC-JP, and other encodings. The algorithm \
                   examines initial bytes for BOM signatures and scans the first 1024 bytes for meta elements \
                   declaring charset or http-equiv content-type.";
    files = List.rev rev_file_results;
    total_passed;
    total_failed;
  } in
  Report.generate_report report "test_encoding_report.html";
  exit (if total_failed > 0 then 1 else 0)