(* Test runner for html5lib-tests tokenizer tests *)
open Bytesrw
module Tokenizer = Html5rw.Tokenizer
module Report = Test_report
(* Token collector sink - collects all tokens into a list *)
module TokenCollector = struct
type t = {
mutable tokens : Html5rw.Tokenizer.Token.t list;
}
let create () = { tokens = [] }
let process t token ~line:_ ~column:_ =
t.tokens <- token :: t.tokens;
`Continue
let adjusted_current_node_in_html_namespace _ = true
let get_tokens t = List.rev t.tokens
end
(* Test case representation *)
type test_case = {
description : string;
input : string;
output : Jsont.json list;
expected_error_count : int;
initial_states : string list;
last_start_tag : string option;
double_escaped : bool;
xml_mode : bool;
raw_json : string; (* Original JSON representation of this test *)
}
(* Unescape double-escaped strings from tests *)
let unescape_double s =
let b = Buffer.create (String.length s) in
let i = ref 0 in
while !i < String.length s do
if !i + 1 < String.length s && s.[!i] = '\\' then begin
match s.[!i + 1] with
| 'u' when !i + 5 < String.length s ->
let hex = String.sub s (!i + 2) 4 in
(try
let code = int_of_string ("0x" ^ hex) in
if code < 128 then Buffer.add_char b (Char.chr code)
else begin
(* UTF-8 encode *)
if code < 0x800 then begin
Buffer.add_char b (Char.chr (0xC0 lor (code lsr 6)));
Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F)))
end else begin
Buffer.add_char b (Char.chr (0xE0 lor (code lsr 12)));
Buffer.add_char b (Char.chr (0x80 lor ((code lsr 6) land 0x3F)));
Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F)))
end
end;
i := !i + 6
with _ ->
Buffer.add_char b s.[!i];
incr i)
| 'n' -> Buffer.add_char b '\n'; i := !i + 2
| 'r' -> Buffer.add_char b '\r'; i := !i + 2
| 't' -> Buffer.add_char b '\t'; i := !i + 2
| '\\' -> Buffer.add_char b '\\'; i := !i + 2
| _ -> Buffer.add_char b s.[!i]; incr i
end else begin
Buffer.add_char b s.[!i];
incr i
end
done;
Buffer.contents b
(* Extract string from JSON node *)
let json_string = function
| Jsont.String (s, _) -> s
| _ -> failwith "Expected string"
let json_bool = function
| Jsont.Bool (b, _) -> b
| _ -> failwith "Expected bool"
let json_array = function
| Jsont.Array (arr, _) -> arr
| _ -> failwith "Expected array"
let json_object = function
| Jsont.Object (obj, _) -> obj
| _ -> failwith "Expected object"
let json_mem name obj =
match List.find_opt (fun ((n, _), _) -> n = name) obj with
| Some (_, v) -> Some v
| None -> None
let json_mem_exn name obj =
match json_mem name obj with
| Some v -> v
| None -> failwith ("Missing member: " ^ name)
(* Format JSON for display *)
let rec json_to_string = function
| Jsont.Null _ -> "null"
| Jsont.Bool (b, _) -> string_of_bool b
| Jsont.Number (n, _) -> Printf.sprintf "%g" n
| Jsont.String (s, _) -> Printf.sprintf "%S" s
| Jsont.Array (arr, _) ->
"[" ^ String.concat ", " (List.map json_to_string arr) ^ "]"
| Jsont.Object (obj, _) ->
"{" ^ String.concat ", " (List.map (fun ((n, _), v) -> Printf.sprintf "%S: %s" n (json_to_string v)) obj) ^ "}"
(* Parse a single test case from JSON *)
let parse_test_case ~xml_mode json =
let raw_json = json_to_string json in
let obj = json_object json in
let description = json_string (json_mem_exn "description" obj) in
let input = json_string (json_mem_exn "input" obj) in
let output = json_array (json_mem_exn "output" obj) in
let expected_error_count = match json_mem "errors" obj with
| Some e -> List.length (json_array e)
| None -> 0
in
let initial_states = match json_mem "initialStates" obj with
| Some s -> List.map json_string (json_array s)
| None -> ["Data state"]
in
let last_start_tag = match json_mem "lastStartTag" obj with
| Some s -> Some (json_string s)
| None -> None
in
let double_escaped = match json_mem "doubleEscaped" obj with
| Some b -> json_bool b
| None -> false
in
{ description; input; output; expected_error_count; initial_states; last_start_tag; double_escaped; xml_mode; raw_json }
(* Convert state name to State.t *)
let state_of_string = function
| "Data state" -> Html5rw.Tokenizer.State.Data
| "PLAINTEXT state" -> Html5rw.Tokenizer.State.Plaintext
| "RCDATA state" -> Html5rw.Tokenizer.State.Rcdata
| "RAWTEXT state" -> Html5rw.Tokenizer.State.Rawtext
| "Script data state" -> Html5rw.Tokenizer.State.Script_data
| "CDATA section state" -> Html5rw.Tokenizer.State.Cdata_section
| s -> failwith ("Unknown state: " ^ s)
(* Convert our token to test format for comparison *)
let token_to_test_json (tok : Html5rw.Tokenizer.Token.t) : Jsont.json list =
let str s = Jsont.String (s, Jsont.Meta.none) in
let arr l = Jsont.Array (l, Jsont.Meta.none) in
match tok with
| Html5rw.Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } ->
let name_json = match name with
| Some n -> str n
| None -> Jsont.Null ((), Jsont.Meta.none)
in
let public_json = match public_id with
| Some p -> str p
| None -> Jsont.Null ((), Jsont.Meta.none)
in
let system_json = match system_id with
| Some s -> str s
| None -> Jsont.Null ((), Jsont.Meta.none)
in
let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in
[arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]]
| Html5rw.Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } ->
let attrs_obj = Jsont.Object (
List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs),
Jsont.Meta.none
) in
if self_closing then
[arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]]
else
[arr [str "StartTag"; str name; attrs_obj]]
| Html5rw.Tokenizer.Token.Tag { kind = End; name; _ } ->
[arr [str "EndTag"; str name]]
| Html5rw.Tokenizer.Token.Comment data ->
[arr [str "Comment"; str data]]
| Html5rw.Tokenizer.Token.Character data ->
(* Split into individual characters for comparison - but actually
the tests expect consecutive characters to be merged *)
[arr [str "Character"; str data]]
| Html5rw.Tokenizer.Token.EOF -> []
(* Compare JSON values for equality *)
let rec json_equal a b =
match a, b with
| Jsont.Null _, Jsont.Null _ -> true
| Jsont.Bool (a, _), Jsont.Bool (b, _) -> a = b
| Jsont.Number (a, _), Jsont.Number (b, _) -> a = b
| Jsont.String (a, _), Jsont.String (b, _) -> a = b
| Jsont.Array (a, _), Jsont.Array (b, _) ->
List.length a = List.length b &&
List.for_all2 json_equal a b
| Jsont.Object (a, _), Jsont.Object (b, _) ->
let a_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) a in
let b_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) b in
List.length a_sorted = List.length b_sorted &&
List.for_all2 (fun ((n1, _), v1) ((n2, _), v2) -> n1 = n2 && json_equal v1 v2) a_sorted b_sorted
| _ -> false
(* Merge consecutive Character tokens *)
let merge_character_tokens tokens =
let rec loop acc = function
| [] -> List.rev acc
| Html5rw.Tokenizer.Token.Character s1 :: Html5rw.Tokenizer.Token.Character s2 :: rest ->
loop acc (Html5rw.Tokenizer.Token.Character (s1 ^ s2) :: rest)
| tok :: rest -> loop (tok :: acc) rest
in
loop [] tokens
(* Run a single test *)
let run_test test initial_state =
let input = if test.double_escaped then unescape_double test.input else test.input in
let collector = TokenCollector.create () in
let tokenizer = Html5rw.Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in
(* Set initial state *)
Html5rw.Tokenizer.set_state tokenizer initial_state;
(* Set last start tag if specified *)
(match test.last_start_tag with
| Some tag -> Html5rw.Tokenizer.set_last_start_tag tokenizer tag
| None -> ());
(* Run tokenizer *)
let reader = Bytes.Reader.of_string input in
Html5rw.Tokenizer.run tokenizer (module TokenCollector) reader;
(* Get results *)
let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in
let actual_tokens = List.concat_map token_to_test_json tokens in
(* Unescape expected output if double_escaped *)
let expected_output = if test.double_escaped then
let rec unescape_json = function
| Jsont.String (s, m) -> Jsont.String (unescape_double s, m)
| Jsont.Array (arr, m) -> Jsont.Array (List.map unescape_json arr, m)
| Jsont.Object (obj, m) ->
Jsont.Object (List.map (fun (n, v) -> (n, unescape_json v)) obj, m)
| other -> other
in
List.map unescape_json test.output
else test.output
in
(* Merge consecutive Character tokens in expected output too *)
let rec merge_expected = function
| [] -> []
| [x] -> [x]
| Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s1, m1)], am1) ::
Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s2, _)], _) ::
rest ->
merge_expected (Jsont.Array ([Jsont.String ("Character", Jsont.Meta.none); Jsont.String (s1 ^ s2, m1)], am1) :: rest)
| x :: rest -> x :: merge_expected rest
in
let expected = merge_expected expected_output in
(* Compare *)
let tokens_match =
List.length actual_tokens = List.length expected &&
List.for_all2 json_equal actual_tokens expected
in
let actual_error_count = List.length (Html5rw.Tokenizer.get_errors tokenizer) in
let errors_count_match = actual_error_count = test.expected_error_count in
(tokens_match && errors_count_match, actual_tokens, expected, actual_error_count, test.expected_error_count)
(* Run all tests in a file *)
let run_file path =
let content =
let ic = open_in path in
let n = in_channel_length ic in
let s = really_input_string ic n in
close_in ic;
s
in
(* Parse JSON *)
let json = match Jsont_bytesrw.decode_string Jsont.json content with
| Ok j -> j
| Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e)
in
let obj = json_object json in
(* Handle both {"tests": [...]} and {"xmlViolationTests": [...]} formats *)
let regular_tests =
match json_mem "tests" obj with
| Some t -> List.map (parse_test_case ~xml_mode:false) (json_array t)
| None -> []
in
let xml_tests =
match json_mem "xmlViolationTests" obj with
| Some t -> List.map (parse_test_case ~xml_mode:true) (json_array t)
| None -> []
in
let all_tests = regular_tests @ xml_tests in
let filename = Filename.basename path in
let passed = ref 0 in
let failed = ref 0 in
let results = ref [] in
List.iteri (fun i test ->
(* Run for each initial state *)
List.iter (fun state_name ->
try
let state = state_of_string state_name in
let (success, actual, expected, actual_err_count, expected_err_count) = run_test test state in
let description = Printf.sprintf "[%s] %s" state_name test.description in
let result : Report.test_result = {
test_num = i + 1;
description;
input = test.input;
expected = String.concat "\n" (List.map json_to_string expected);
actual = String.concat "\n" (List.map json_to_string actual);
success;
details = [
("Initial State", state_name);
("Last Start Tag", Option.value test.last_start_tag ~default:"(none)");
("Double Escaped", string_of_bool test.double_escaped);
("XML Mode", string_of_bool test.xml_mode);
("Expected Errors", string_of_int expected_err_count);
("Actual Errors", string_of_int actual_err_count);
];
raw_test_data = Some test.raw_json;
} in
results := result :: !results;
if success then incr passed else incr failed
with e ->
incr failed;
let result : Report.test_result = {
test_num = i + 1;
description = Printf.sprintf "[%s] %s" state_name test.description;
input = test.input;
expected = "";
actual = Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e);
success = false;
details = [];
raw_test_data = Some test.raw_json;
} in
results := result :: !results;
Printf.eprintf "Exception in test %d (%s): %s\n" (i + 1) test.description (Printexc.to_string e)
) test.initial_states
) all_tests;
let file_result : Report.file_result = {
filename;
test_type = "Tokenizer";
passed_count = !passed;
failed_count = !failed;
tests = List.rev !results;
} in
(file_result, !passed, !failed)
let () =
let test_dir = Sys.argv.(1) in
let files = Sys.readdir test_dir |> Array.to_list in
let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in
let total_passed = ref 0 in
let total_failed = ref 0 in
let file_results = ref [] in
List.iter (fun file ->
let path = Filename.concat test_dir file in
let (file_result, passed, failed) = run_file path in
total_passed := !total_passed + passed;
total_failed := !total_failed + failed;
file_results := file_result :: !file_results;
Printf.printf "%s: %d passed, %d failed\n" file passed failed
) (List.sort String.compare test_files);
Printf.printf "\n=== Summary ===\n";
Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;
(* Generate HTML report *)
let report : Report.report = {
title = "HTML5 Tokenizer Tests";
test_type = "tokenizer";
description = "These tests validate the HTML5 tokenization algorithm as specified in the WHATWG HTML Standard. \
The tokenizer converts HTML input into a stream of tokens (DOCTYPE, start tags, end tags, comments, \
character data, and EOF). Each test specifies input HTML, expected tokens in JSON array format, \
and the initial tokenizer state. Tests cover normal parsing, RCDATA/RAWTEXT/PLAINTEXT states, \
script data parsing, CDATA sections, and various error conditions. Some tests are double-escaped \
to represent special characters. XML violation tests check behavior differences from XML mode.";
files = List.rev !file_results;
total_passed = !total_passed;
total_failed = !total_failed;
match_quality = None;
test_type_breakdown = None;
strictness_mode = None;
run_timestamp = None;
} in
Report.generate_report report "test_tokenizer_report.html";
exit (if !total_failed > 0 then 1 else 0)