(* Test runner for html5lib-tests encoding tests *)

module Encoding = Html5rw.Encoding
module Report = Test_report

(** One test case parsed from a [.dat] file. *)
type test_case = {
  input : string;  (* Contents of the [#data] section *)
  expected_encoding : string;  (* Trimmed contents of the [#encoding] section *)
  raw_lines : string;  (* Original test data from .dat file *)
}

(** Normalize an encoding name for comparison: trim surrounding whitespace
    and lowercase ASCII letters. *)
let normalize_encoding_name s = String.lowercase_ascii (String.trim s)

(** Convert our encoding type to the canonical name used for comparison
    against the expected encoding of a test. *)
let encoding_to_test_name = function
  | Encoding.Utf8 -> "utf-8"
  | Encoding.Utf16le -> "utf-16le"
  | Encoding.Utf16be -> "utf-16be"
  | Encoding.Windows_1252 -> "windows-1252"
  | Encoding.Iso_8859_2 -> "iso-8859-2"
  | Encoding.Euc_jp -> "euc-jp"

(* A section header is any non-empty line starting with '#'. *)
let is_section_header line = String.length line > 0 && line.[0] = '#'

(** Parse a single test case from its lines.

    Lines are grouped into sections, each introduced by a header line
    (see {!is_section_header}); the header is trimmed before being used as
    the section name. Lines that precede the first header are ignored.
    A missing [#data] or [#encoding] section yields the empty string. *)
let parse_test_case lines =
  let raw_lines = String.concat "\n" lines in
  (* Gather the lines up to (but not including) the next header. *)
  let rec collect_section acc = function
    | line :: rest when not (is_section_header line) ->
        collect_section (line :: acc) rest
    | remaining -> (List.rev acc, remaining)
  in
  (* Sections are prepended, so when a header occurs more than once the
     lookup below finds its last occurrence. *)
  let rec parse acc = function
    | [] -> acc
    | line :: rest when is_section_header line ->
        let content, remaining = collect_section [] rest in
        parse ((String.trim line, content) :: acc) remaining
    | _ :: rest -> parse acc rest
  in
  let sections = parse [] lines in
  let get_section name =
    match List.assoc_opt name sections with
    | Some content -> String.concat "\n" content
    | None -> ""
  in
  {
    input = get_section "#data";
    expected_encoding = String.trim (get_section "#encoding");
    raw_lines;
  }

(** Parse the contents of a [.dat] file into test cases.

    A new test starts at every [#data] line that directly follows an empty
    line; that empty separator line is dropped. Groups that contain no
    [#data] line are discarded. *)
let parse_dat_file content =
  let lines = String.split_on_char '\n' content in
  (* Push the group being built (stored reversed) unless it is empty. *)
  let flush current acc =
    if current = [] then acc else List.rev current :: acc
  in
  let rec split_tests current acc = function
    | [] -> List.rev (flush current acc)
    | "" :: "#data" :: rest -> split_tests [ "#data" ] (flush current acc) rest
    | line :: rest -> split_tests (line :: current) acc rest
  in
  split_tests [] [] lines
  |> List.filter_map (fun group ->
         if List.mem "#data" group then Some (parse_test_case group) else None)

(* Alternative spellings accepted for each canonical detected name. *)
let encoding_aliases =
  [
    ("windows-1252", [ "cp1252"; "iso-8859-1" ]);
    ("iso-8859-2", [ "iso8859-2"; "latin2" ]);
    ("utf-8", [ "utf8" ]);
    ("euc-jp", [ "eucjp" ]);
  ]

(* [encodings_match detected expected] holds when both names are equal or
   when [expected] is a listed alias of [detected]. *)
let encodings_match detected expected =
  detected = expected
  ||
  match List.assoc_opt detected encoding_aliases with
  | Some aliases -> List.mem expected aliases
  | None -> false

(** Run a single encoding test.

    Returns [(success, detected, expected)] where [expected] is the
    normalized expected encoding name. If detection raises, the test fails
    and [detected] carries the text ["EXCEPTION: ..."] describing the
    exception. *)
let run_test test =
  (* Normalized up front so both outcomes report the same expected name. *)
  let expected_name = normalize_encoding_name test.expected_encoding in
  try
    (* Detect encoding from the input bytes *)
    let _, detected_encoding =
      Encoding.decode (Bytes.of_string test.input) ()
    in
    let detected_name = encoding_to_test_name detected_encoding in
    (encodings_match detected_name expected_name, detected_name, expected_name)
  with e ->
    (* Deliberate catch-all: any exception is recorded as a failed test
       instead of aborting the whole run. *)
    ( false,
      Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e),
      expected_name )

(* [has_prefix ~prefix s] is true when [s] starts with [prefix]. *)
let has_prefix ~prefix s =
  let n = String.length prefix in
  String.length s >= n && String.sub s 0 n = prefix

(* True when [s] starts with a UTF-8, UTF-16 LE or UTF-16 BE byte order
   mark. Each mark is checked against its own length, so a 2-byte input
   consisting only of a UTF-16 BOM is recognized. *)
let has_bom s =
  List.exists
    (fun prefix -> has_prefix ~prefix s)
    [
      "\xEF\xBB\xBF" (* UTF-8 BOM *);
      "\xFF\xFE" (* UTF-16 LE BOM *);
      "\xFE\xFF" (* UTF-16 BE BOM *);
    ]

(* Read a whole file as raw bytes. The channel is opened in binary mode
   because the test data is byte-oriented and [in_channel_length] counts
   bytes; it is closed even if reading raises. *)
let read_file path =
  let ic = open_in_bin path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () -> really_input_string ic (in_channel_length ic))

(** Run all tests in a file.

    Tests without an expected encoding are skipped, but still count towards
    test numbering. Returns [(file_result, passed, failed)]. *)
let run_file path =
  let tests = parse_dat_file (read_file path) in
  let filename = Filename.basename path in
  let to_result (test_num, test) =
    let success, detected, expected = run_test test in
    let result : Report.test_result =
      {
        test_num;
        description = Printf.sprintf "Detect %s encoding" expected;
        input = String.escaped test.input;
        (* Show escaped version of full input *)
        expected;
        actual = detected;
        success;
        details =
          [
            ("Input Length", string_of_int (String.length test.input));
            ("Has BOM", string_of_bool (has_bom test.input));
          ];
        raw_test_data = Some test.raw_lines;
      }
    in
    result
  in
  let results =
    tests
    |> List.mapi (fun i test -> (i + 1, test))
    (* Skip tests without expected encoding *)
    |> List.filter (fun (_, test) -> String.trim test.expected_encoding <> "")
    |> List.map to_result
  in
  let passed =
    results
    |> List.filter (fun (r : Report.test_result) -> r.Report.success)
    |> List.length
  in
  let failed = List.length results - passed in
  let file_result : Report.file_result =
    {
      filename;
      test_type = "Encoding Detection";
      passed_count = passed;
      failed_count = failed;
      tests = results;
    }
  in
  (file_result, passed, failed)

(* Entry point: run every [.dat] file found directly in the directory given
   as the first command-line argument, print per-file and total counts,
   write the HTML report, and exit with status 1 if any test failed. *)
let () =
  if Array.length Sys.argv < 2 then begin
    Printf.eprintf "Usage: %s <test-directory>\n" Sys.argv.(0);
    exit 2
  end;
  let test_dir = Sys.argv.(1) in
  let dat_files =
    Sys.readdir test_dir
    |> Array.to_list
    |> List.filter (fun f ->
           Filename.check_suffix f ".dat" && not (String.contains f '/'))
    |> List.sort String.compare
  in
  let outcomes =
    List.filter_map
      (fun file ->
        let path = Filename.concat test_dir file in
        if Sys.is_directory path then None
        else begin
          let file_result, passed, failed = run_file path in
          Printf.printf "%s: %d passed, %d failed\n" file passed failed;
          Some (file_result, passed, failed)
        end)
      dat_files
  in
  let total_passed, total_failed =
    List.fold_left
      (fun (p, f) (_, passed, failed) -> (p + passed, f + failed))
      (0, 0) outcomes
  in
  Printf.printf "\n=== Summary ===\n";
  Printf.printf "Total: %d passed, %d failed\n" total_passed total_failed;
  (* Generate HTML report *)
  let report : Report.report =
    {
      title = "HTML5 Encoding Detection Tests";
      test_type = "encoding";
      description =
        "These tests validate the character encoding detection algorithm as specified in the WHATWG \
         Encoding Standard. The parser must determine the document's character encoding from byte order \
         marks (BOM), meta charset declarations, or content sniffing. Tests cover UTF-8, UTF-16 \
         (big/little endian), Windows-1252, ISO-8859-2, EUC-JP, and other encodings. The algorithm \
         examines initial bytes for BOM signatures and scans the first 1024 bytes for meta elements \
         declaring charset or http-equiv content-type.";
      files = List.map (fun (file_result, _, _) -> file_result) outcomes;
      total_passed;
      total_failed;
    }
  in
  Report.generate_report report "test_encoding_report.html";
  exit (if total_failed > 0 then 1 else 0)