(* Test runner for html5lib-tests tokenizer tests *) open Bytesrw module Tokenizer = Html5rw.Tokenizer module Report = Test_report (* Token collector sink - collects all tokens into a list *) module TokenCollector = struct type t = { mutable tokens : Html5rw.Tokenizer.Token.t list; } let create () = { tokens = [] } let process t token ~line:_ ~column:_ = t.tokens <- token :: t.tokens; `Continue let adjusted_current_node_in_html_namespace _ = true let get_tokens t = List.rev t.tokens end (* Test case representation *) type test_case = { description : string; input : string; output : Jsont.json list; expected_error_count : int; initial_states : string list; last_start_tag : string option; double_escaped : bool; xml_mode : bool; raw_json : string; (* Original JSON representation of this test *) } (* Unescape double-escaped strings from tests *) let unescape_double s = let b = Buffer.create (String.length s) in let i = ref 0 in while !i < String.length s do if !i + 1 < String.length s && s.[!i] = '\\' then begin match s.[!i + 1] with | 'u' when !i + 5 < String.length s -> let hex = String.sub s (!i + 2) 4 in (try let code = int_of_string ("0x" ^ hex) in if code < 128 then Buffer.add_char b (Char.chr code) else begin (* UTF-8 encode *) if code < 0x800 then begin Buffer.add_char b (Char.chr (0xC0 lor (code lsr 6))); Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F))) end else begin Buffer.add_char b (Char.chr (0xE0 lor (code lsr 12))); Buffer.add_char b (Char.chr (0x80 lor ((code lsr 6) land 0x3F))); Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F))) end end; i := !i + 6 with _ -> Buffer.add_char b s.[!i]; incr i) | 'n' -> Buffer.add_char b '\n'; i := !i + 2 | 'r' -> Buffer.add_char b '\r'; i := !i + 2 | 't' -> Buffer.add_char b '\t'; i := !i + 2 | '\\' -> Buffer.add_char b '\\'; i := !i + 2 | _ -> Buffer.add_char b s.[!i]; incr i end else begin Buffer.add_char b s.[!i]; incr i end done; Buffer.contents b (* Extract string from JSON node *) let json_string = function | Jsont.String (s, _) -> s | _ -> failwith "Expected string" let json_bool = function | Jsont.Bool (b, _) -> b | _ -> failwith "Expected bool" let json_array = function | Jsont.Array (arr, _) -> arr | _ -> failwith "Expected array" let json_object = function | Jsont.Object (obj, _) -> obj | _ -> failwith "Expected object" let json_mem name obj = match List.find_opt (fun ((n, _), _) -> n = name) obj with | Some (_, v) -> Some v | None -> None let json_mem_exn name obj = match json_mem name obj with | Some v -> v | None -> failwith ("Missing member: " ^ name) (* Format JSON for display *) let rec json_to_string = function | Jsont.Null _ -> "null" | Jsont.Bool (b, _) -> string_of_bool b | Jsont.Number (n, _) -> Printf.sprintf "%g" n | Jsont.String (s, _) -> Printf.sprintf "%S" s | Jsont.Array (arr, _) -> "[" ^ String.concat ", " (List.map json_to_string arr) ^ "]" | Jsont.Object (obj, _) -> "{" ^ String.concat ", " (List.map (fun ((n, _), v) -> Printf.sprintf "%S: %s" n (json_to_string v)) obj) ^ "}" (* Parse a single test case from JSON *) let parse_test_case ~xml_mode json = let raw_json = json_to_string json in let obj = json_object json in let description = json_string (json_mem_exn "description" obj) in let input = json_string (json_mem_exn "input" obj) in let output = json_array (json_mem_exn "output" obj) in let expected_error_count = match json_mem "errors" obj with | Some e -> List.length (json_array e) | None -> 0 in let initial_states = match json_mem "initialStates" obj with | Some s -> List.map json_string (json_array s) | None -> ["Data state"] in let last_start_tag = match json_mem "lastStartTag" obj with | Some s -> Some (json_string s) | None -> None in let double_escaped = match json_mem "doubleEscaped" obj with | Some b -> json_bool b | None -> false in { description; input; output; expected_error_count; initial_states; last_start_tag; double_escaped; xml_mode; raw_json } (* Convert state name to State.t *) let state_of_string = function | "Data state" -> Html5rw.Tokenizer.State.Data | "PLAINTEXT state" -> Html5rw.Tokenizer.State.Plaintext | "RCDATA state" -> Html5rw.Tokenizer.State.Rcdata | "RAWTEXT state" -> Html5rw.Tokenizer.State.Rawtext | "Script data state" -> Html5rw.Tokenizer.State.Script_data | "CDATA section state" -> Html5rw.Tokenizer.State.Cdata_section | s -> failwith ("Unknown state: " ^ s) (* Convert our token to test format for comparison *) let token_to_test_json (tok : Html5rw.Tokenizer.Token.t) : Jsont.json list = let str s = Jsont.String (s, Jsont.Meta.none) in let arr l = Jsont.Array (l, Jsont.Meta.none) in match tok with | Html5rw.Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } -> let name_json = match name with | Some n -> str n | None -> Jsont.Null ((), Jsont.Meta.none) in let public_json = match public_id with | Some p -> str p | None -> Jsont.Null ((), Jsont.Meta.none) in let system_json = match system_id with | Some s -> str s | None -> Jsont.Null ((), Jsont.Meta.none) in let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in [arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]] | Html5rw.Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } -> let attrs_obj = Jsont.Object ( List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs), Jsont.Meta.none ) in if self_closing then [arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]] else [arr [str "StartTag"; str name; attrs_obj]] | Html5rw.Tokenizer.Token.Tag { kind = End; name; _ } -> [arr [str "EndTag"; str name]] | Html5rw.Tokenizer.Token.Comment data -> [arr [str "Comment"; str data]] | Html5rw.Tokenizer.Token.Character data -> (* Split into individual characters for comparison - but actually the tests expect consecutive characters to be merged *) [arr [str "Character"; str data]] | Html5rw.Tokenizer.Token.EOF -> [] (* Compare JSON values for equality *) let rec json_equal a b = match a, b with | Jsont.Null _, Jsont.Null _ -> true | Jsont.Bool (a, _), Jsont.Bool (b, _) -> a = b | Jsont.Number (a, _), Jsont.Number (b, _) -> a = b | Jsont.String (a, _), Jsont.String (b, _) -> a = b | Jsont.Array (a, _), Jsont.Array (b, _) -> List.length a = List.length b && List.for_all2 json_equal a b | Jsont.Object (a, _), Jsont.Object (b, _) -> let a_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) a in let b_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) b in List.length a_sorted = List.length b_sorted && List.for_all2 (fun ((n1, _), v1) ((n2, _), v2) -> n1 = n2 && json_equal v1 v2) a_sorted b_sorted | _ -> false (* Merge consecutive Character tokens *) let merge_character_tokens tokens = let rec loop acc = function | [] -> List.rev acc | Html5rw.Tokenizer.Token.Character s1 :: Html5rw.Tokenizer.Token.Character s2 :: rest -> loop acc (Html5rw.Tokenizer.Token.Character (s1 ^ s2) :: rest) | tok :: rest -> loop (tok :: acc) rest in loop [] tokens (* Run a single test *) let run_test test initial_state = let input = if test.double_escaped then unescape_double test.input else test.input in let collector = TokenCollector.create () in let tokenizer = Html5rw.Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in (* Set initial state *) Html5rw.Tokenizer.set_state tokenizer initial_state; (* Set last start tag if specified *) (match test.last_start_tag with | Some tag -> Html5rw.Tokenizer.set_last_start_tag tokenizer tag | None -> ()); (* Run tokenizer *) let reader = Bytes.Reader.of_string input in Html5rw.Tokenizer.run tokenizer (module TokenCollector) reader; (* Get results *) let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in let actual_tokens = List.concat_map token_to_test_json tokens in (* Unescape expected output if double_escaped *) let expected_output = if test.double_escaped then let rec unescape_json = function | Jsont.String (s, m) -> Jsont.String (unescape_double s, m) | Jsont.Array (arr, m) -> Jsont.Array (List.map unescape_json arr, m) | Jsont.Object (obj, m) -> Jsont.Object (List.map (fun (n, v) -> (n, unescape_json v)) obj, m) | other -> other in List.map unescape_json test.output else test.output in (* Merge consecutive Character tokens in expected output too *) let rec merge_expected = function | [] -> [] | [x] -> [x] | Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s1, m1)], am1) :: Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s2, _)], _) :: rest -> merge_expected (Jsont.Array ([Jsont.String ("Character", Jsont.Meta.none); Jsont.String (s1 ^ s2, m1)], am1) :: rest) | x :: rest -> x :: merge_expected rest in let expected = merge_expected expected_output in (* Compare *) let tokens_match = List.length actual_tokens = List.length expected && List.for_all2 json_equal actual_tokens expected in let actual_error_count = List.length (Html5rw.Tokenizer.get_errors tokenizer) in let errors_count_match = actual_error_count = test.expected_error_count in (tokens_match && errors_count_match, actual_tokens, expected, actual_error_count, test.expected_error_count) (* Run all tests in a file *) let run_file path = let content = let ic = open_in path in let n = in_channel_length ic in let s = really_input_string ic n in close_in ic; s in (* Parse JSON *) let json = match Jsont_bytesrw.decode_string Jsont.json content with | Ok j -> j | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e) in let obj = json_object json in (* Handle both {"tests": [...]} and {"xmlViolationTests": [...]} formats *) let regular_tests = match json_mem "tests" obj with | Some t -> List.map (parse_test_case ~xml_mode:false) (json_array t) | None -> [] in let xml_tests = match json_mem "xmlViolationTests" obj with | Some t -> List.map (parse_test_case ~xml_mode:true) (json_array t) | None -> [] in let all_tests = regular_tests @ xml_tests in let filename = Filename.basename path in let passed = ref 0 in let failed = ref 0 in let results = ref [] in List.iteri (fun i test -> (* Run for each initial state *) List.iter (fun state_name -> try let state = state_of_string state_name in let (success, actual, expected, actual_err_count, expected_err_count) = run_test test state in let description = Printf.sprintf "[%s] %s" state_name test.description in let result : Report.test_result = { test_num = i + 1; description; input = test.input; expected = String.concat "\n" (List.map json_to_string expected); actual = String.concat "\n" (List.map json_to_string actual); success; details = [ ("Initial State", state_name); ("Last Start Tag", Option.value test.last_start_tag ~default:"(none)"); ("Double Escaped", string_of_bool test.double_escaped); ("XML Mode", string_of_bool test.xml_mode); ("Expected Errors", string_of_int expected_err_count); ("Actual Errors", string_of_int actual_err_count); ]; raw_test_data = Some test.raw_json; } in results := result :: !results; if success then incr passed else incr failed with e -> incr failed; let result : Report.test_result = { test_num = i + 1; description = Printf.sprintf "[%s] %s" state_name test.description; input = test.input; expected = ""; actual = Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e); success = false; details = []; raw_test_data = Some test.raw_json; } in results := result :: !results; Printf.eprintf "Exception in test %d (%s): %s\n" (i + 1) test.description (Printexc.to_string e) ) test.initial_states ) all_tests; let file_result : Report.file_result = { filename; test_type = "Tokenizer"; passed_count = !passed; failed_count = !failed; tests = List.rev !results; } in (file_result, !passed, !failed) let () = let test_dir = Sys.argv.(1) in let files = Sys.readdir test_dir |> Array.to_list in let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in let total_passed = ref 0 in let total_failed = ref 0 in let file_results = ref [] in List.iter (fun file -> let path = Filename.concat test_dir file in let (file_result, passed, failed) = run_file path in total_passed := !total_passed + passed; total_failed := !total_failed + failed; file_results := file_result :: !file_results; Printf.printf "%s: %d passed, %d failed\n" file passed failed ) (List.sort String.compare test_files); Printf.printf "\n=== Summary ===\n"; Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed; (* Generate HTML report *) let report : Report.report = { title = "HTML5 Tokenizer Tests"; test_type = "tokenizer"; description = "These tests validate the HTML5 tokenization algorithm as specified in the WHATWG HTML Standard. \ The tokenizer converts HTML input into a stream of tokens (DOCTYPE, start tags, end tags, comments, \ character data, and EOF). Each test specifies input HTML, expected tokens in JSON array format, \ and the initial tokenizer state. Tests cover normal parsing, RCDATA/RAWTEXT/PLAINTEXT states, \ script data parsing, CDATA sections, and various error conditions. Some tests are double-escaped \ to represent special characters. XML violation tests check behavior differences from XML mode."; files = List.rev !file_results; total_passed = !total_passed; total_failed = !total_failed; } in Report.generate_report report "test_tokenizer_report.html"; exit (if !total_failed > 0 then 1 else 0)