OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Test runner for html5lib-tests tokenizer tests *)
2
3open Bytesrw
4
5module Tokenizer = Html5rw.Tokenizer
6module Report = Test_report
7
8(* Token collector sink - collects all tokens into a list *)
9module TokenCollector = struct
10 type t = {
11 mutable tokens : Html5rw.Tokenizer.Token.t list;
12 }
13
14 let create () = { tokens = [] }
15
16 let process t token ~line:_ ~column:_ =
17 t.tokens <- token :: t.tokens;
18 `Continue
19
20 let adjusted_current_node_in_html_namespace _ = true
21
22 let get_tokens t = List.rev t.tokens
23end
24
25(* Test case representation *)
26type test_case = {
27 description : string;
28 input : string;
29 output : Jsont.json list;
30 expected_error_count : int;
31 initial_states : string list;
32 last_start_tag : string option;
33 double_escaped : bool;
34 xml_mode : bool;
35 raw_json : string; (* Original JSON representation of this test *)
36}
37
38(* Unescape double-escaped strings from tests *)
39let unescape_double s =
40 let b = Buffer.create (String.length s) in
41 let i = ref 0 in
42 while !i < String.length s do
43 if !i + 1 < String.length s && s.[!i] = '\\' then begin
44 match s.[!i + 1] with
45 | 'u' when !i + 5 < String.length s ->
46 let hex = String.sub s (!i + 2) 4 in
47 (try
48 let code = int_of_string ("0x" ^ hex) in
49 if code < 128 then Buffer.add_char b (Char.chr code)
50 else begin
51 (* UTF-8 encode *)
52 if code < 0x800 then begin
53 Buffer.add_char b (Char.chr (0xC0 lor (code lsr 6)));
54 Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F)))
55 end else begin
56 Buffer.add_char b (Char.chr (0xE0 lor (code lsr 12)));
57 Buffer.add_char b (Char.chr (0x80 lor ((code lsr 6) land 0x3F)));
58 Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F)))
59 end
60 end;
61 i := !i + 6
62 with _ ->
63 Buffer.add_char b s.[!i];
64 incr i)
65 | 'n' -> Buffer.add_char b '\n'; i := !i + 2
66 | 'r' -> Buffer.add_char b '\r'; i := !i + 2
67 | 't' -> Buffer.add_char b '\t'; i := !i + 2
68 | '\\' -> Buffer.add_char b '\\'; i := !i + 2
69 | _ -> Buffer.add_char b s.[!i]; incr i
70 end else begin
71 Buffer.add_char b s.[!i];
72 incr i
73 end
74 done;
75 Buffer.contents b
76
77(* Extract string from JSON node *)
78let json_string = function
79 | Jsont.String (s, _) -> s
80 | _ -> failwith "Expected string"
81
82let json_bool = function
83 | Jsont.Bool (b, _) -> b
84 | _ -> failwith "Expected bool"
85
86let json_array = function
87 | Jsont.Array (arr, _) -> arr
88 | _ -> failwith "Expected array"
89
90let json_object = function
91 | Jsont.Object (obj, _) -> obj
92 | _ -> failwith "Expected object"
93
94let json_mem name obj =
95 match List.find_opt (fun ((n, _), _) -> n = name) obj with
96 | Some (_, v) -> Some v
97 | None -> None
98
99let json_mem_exn name obj =
100 match json_mem name obj with
101 | Some v -> v
102 | None -> failwith ("Missing member: " ^ name)
103
104(* Format JSON for display *)
105let rec json_to_string = function
106 | Jsont.Null _ -> "null"
107 | Jsont.Bool (b, _) -> string_of_bool b
108 | Jsont.Number (n, _) -> Printf.sprintf "%g" n
109 | Jsont.String (s, _) -> Printf.sprintf "%S" s
110 | Jsont.Array (arr, _) ->
111 "[" ^ String.concat ", " (List.map json_to_string arr) ^ "]"
112 | Jsont.Object (obj, _) ->
113 "{" ^ String.concat ", " (List.map (fun ((n, _), v) -> Printf.sprintf "%S: %s" n (json_to_string v)) obj) ^ "}"
114
115(* Parse a single test case from JSON *)
116let parse_test_case ~xml_mode json =
117 let raw_json = json_to_string json in
118 let obj = json_object json in
119 let description = json_string (json_mem_exn "description" obj) in
120 let input = json_string (json_mem_exn "input" obj) in
121 let output = json_array (json_mem_exn "output" obj) in
122 let expected_error_count = match json_mem "errors" obj with
123 | Some e -> List.length (json_array e)
124 | None -> 0
125 in
126 let initial_states = match json_mem "initialStates" obj with
127 | Some s -> List.map json_string (json_array s)
128 | None -> ["Data state"]
129 in
130 let last_start_tag = match json_mem "lastStartTag" obj with
131 | Some s -> Some (json_string s)
132 | None -> None
133 in
134 let double_escaped = match json_mem "doubleEscaped" obj with
135 | Some b -> json_bool b
136 | None -> false
137 in
138 { description; input; output; expected_error_count; initial_states; last_start_tag; double_escaped; xml_mode; raw_json }
139
140(* Convert state name to State.t *)
141let state_of_string = function
142 | "Data state" -> Html5rw.Tokenizer.State.Data
143 | "PLAINTEXT state" -> Html5rw.Tokenizer.State.Plaintext
144 | "RCDATA state" -> Html5rw.Tokenizer.State.Rcdata
145 | "RAWTEXT state" -> Html5rw.Tokenizer.State.Rawtext
146 | "Script data state" -> Html5rw.Tokenizer.State.Script_data
147 | "CDATA section state" -> Html5rw.Tokenizer.State.Cdata_section
148 | s -> failwith ("Unknown state: " ^ s)
149
150(* Convert our token to test format for comparison *)
151let token_to_test_json (tok : Html5rw.Tokenizer.Token.t) : Jsont.json list =
152 let str s = Jsont.String (s, Jsont.Meta.none) in
153 let arr l = Jsont.Array (l, Jsont.Meta.none) in
154 match tok with
155 | Html5rw.Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } ->
156 let name_json = match name with
157 | Some n -> str n
158 | None -> Jsont.Null ((), Jsont.Meta.none)
159 in
160 let public_json = match public_id with
161 | Some p -> str p
162 | None -> Jsont.Null ((), Jsont.Meta.none)
163 in
164 let system_json = match system_id with
165 | Some s -> str s
166 | None -> Jsont.Null ((), Jsont.Meta.none)
167 in
168 let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in
169 [arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]]
170 | Html5rw.Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } ->
171 let attrs_obj = Jsont.Object (
172 List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs),
173 Jsont.Meta.none
174 ) in
175 if self_closing then
176 [arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]]
177 else
178 [arr [str "StartTag"; str name; attrs_obj]]
179 | Html5rw.Tokenizer.Token.Tag { kind = End; name; _ } ->
180 [arr [str "EndTag"; str name]]
181 | Html5rw.Tokenizer.Token.Comment data ->
182 [arr [str "Comment"; str data]]
183 | Html5rw.Tokenizer.Token.Character data ->
184 (* Split into individual characters for comparison - but actually
185 the tests expect consecutive characters to be merged *)
186 [arr [str "Character"; str data]]
187 | Html5rw.Tokenizer.Token.EOF -> []
188
189(* Compare JSON values for equality *)
190let rec json_equal a b =
191 match a, b with
192 | Jsont.Null _, Jsont.Null _ -> true
193 | Jsont.Bool (a, _), Jsont.Bool (b, _) -> a = b
194 | Jsont.Number (a, _), Jsont.Number (b, _) -> a = b
195 | Jsont.String (a, _), Jsont.String (b, _) -> a = b
196 | Jsont.Array (a, _), Jsont.Array (b, _) ->
197 List.length a = List.length b &&
198 List.for_all2 json_equal a b
199 | Jsont.Object (a, _), Jsont.Object (b, _) ->
200 let a_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) a in
201 let b_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) b in
202 List.length a_sorted = List.length b_sorted &&
203 List.for_all2 (fun ((n1, _), v1) ((n2, _), v2) -> n1 = n2 && json_equal v1 v2) a_sorted b_sorted
204 | _ -> false
205
206(* Merge consecutive Character tokens *)
207let merge_character_tokens tokens =
208 let rec loop acc = function
209 | [] -> List.rev acc
210 | Html5rw.Tokenizer.Token.Character s1 :: Html5rw.Tokenizer.Token.Character s2 :: rest ->
211 loop acc (Html5rw.Tokenizer.Token.Character (s1 ^ s2) :: rest)
212 | tok :: rest -> loop (tok :: acc) rest
213 in
214 loop [] tokens
215
216(* Run a single test *)
217let run_test test initial_state =
218 let input = if test.double_escaped then unescape_double test.input else test.input in
219
220 let collector = TokenCollector.create () in
221 let tokenizer = Html5rw.Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in
222
223 (* Set initial state *)
224 Html5rw.Tokenizer.set_state tokenizer initial_state;
225
226 (* Set last start tag if specified *)
227 (match test.last_start_tag with
228 | Some tag -> Html5rw.Tokenizer.set_last_start_tag tokenizer tag
229 | None -> ());
230
231 (* Run tokenizer *)
232 let reader = Bytes.Reader.of_string input in
233 Html5rw.Tokenizer.run tokenizer (module TokenCollector) reader;
234
235 (* Get results *)
236 let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in
237 let actual_tokens = List.concat_map token_to_test_json tokens in
238
239 (* Unescape expected output if double_escaped *)
240 let expected_output = if test.double_escaped then
241 let rec unescape_json = function
242 | Jsont.String (s, m) -> Jsont.String (unescape_double s, m)
243 | Jsont.Array (arr, m) -> Jsont.Array (List.map unescape_json arr, m)
244 | Jsont.Object (obj, m) ->
245 Jsont.Object (List.map (fun (n, v) -> (n, unescape_json v)) obj, m)
246 | other -> other
247 in
248 List.map unescape_json test.output
249 else test.output
250 in
251
252 (* Merge consecutive Character tokens in expected output too *)
253 let rec merge_expected = function
254 | [] -> []
255 | [x] -> [x]
256 | Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s1, m1)], am1) ::
257 Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s2, _)], _) ::
258 rest ->
259 merge_expected (Jsont.Array ([Jsont.String ("Character", Jsont.Meta.none); Jsont.String (s1 ^ s2, m1)], am1) :: rest)
260 | x :: rest -> x :: merge_expected rest
261 in
262 let expected = merge_expected expected_output in
263
264 (* Compare *)
265 let tokens_match =
266 List.length actual_tokens = List.length expected &&
267 List.for_all2 json_equal actual_tokens expected
268 in
269
270 let actual_error_count = List.length (Html5rw.Tokenizer.get_errors tokenizer) in
271 let errors_count_match = actual_error_count = test.expected_error_count in
272
273 (tokens_match && errors_count_match, actual_tokens, expected, actual_error_count, test.expected_error_count)
274
275(* Run all tests in a file *)
276let run_file path =
277 let content =
278 let ic = open_in path in
279 let n = in_channel_length ic in
280 let s = really_input_string ic n in
281 close_in ic;
282 s
283 in
284
285 (* Parse JSON *)
286 let json = match Jsont_bytesrw.decode_string Jsont.json content with
287 | Ok j -> j
288 | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e)
289 in
290
291 let obj = json_object json in
292
293 (* Handle both {"tests": [...]} and {"xmlViolationTests": [...]} formats *)
294 let regular_tests =
295 match json_mem "tests" obj with
296 | Some t -> List.map (parse_test_case ~xml_mode:false) (json_array t)
297 | None -> []
298 in
299 let xml_tests =
300 match json_mem "xmlViolationTests" obj with
301 | Some t -> List.map (parse_test_case ~xml_mode:true) (json_array t)
302 | None -> []
303 in
304 let all_tests = regular_tests @ xml_tests in
305
306 let filename = Filename.basename path in
307 let passed = ref 0 in
308 let failed = ref 0 in
309 let results = ref [] in
310
311 List.iteri (fun i test ->
312 (* Run for each initial state *)
313 List.iter (fun state_name ->
314 try
315 let state = state_of_string state_name in
316 let (success, actual, expected, actual_err_count, expected_err_count) = run_test test state in
317
318 let description = Printf.sprintf "[%s] %s" state_name test.description in
319 let result : Report.test_result = {
320 test_num = i + 1;
321 description;
322 input = test.input;
323 expected = String.concat "\n" (List.map json_to_string expected);
324 actual = String.concat "\n" (List.map json_to_string actual);
325 success;
326 details = [
327 ("Initial State", state_name);
328 ("Last Start Tag", Option.value test.last_start_tag ~default:"(none)");
329 ("Double Escaped", string_of_bool test.double_escaped);
330 ("XML Mode", string_of_bool test.xml_mode);
331 ("Expected Errors", string_of_int expected_err_count);
332 ("Actual Errors", string_of_int actual_err_count);
333 ];
334 raw_test_data = Some test.raw_json;
335 } in
336 results := result :: !results;
337
338 if success then incr passed else incr failed
339 with e ->
340 incr failed;
341 let result : Report.test_result = {
342 test_num = i + 1;
343 description = Printf.sprintf "[%s] %s" state_name test.description;
344 input = test.input;
345 expected = "";
346 actual = Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e);
347 success = false;
348 details = [];
349 raw_test_data = Some test.raw_json;
350 } in
351 results := result :: !results;
352 Printf.eprintf "Exception in test %d (%s): %s\n" (i + 1) test.description (Printexc.to_string e)
353 ) test.initial_states
354 ) all_tests;
355
356 let file_result : Report.file_result = {
357 filename;
358 test_type = "Tokenizer";
359 passed_count = !passed;
360 failed_count = !failed;
361 tests = List.rev !results;
362 } in
363 (file_result, !passed, !failed)
364
365let () =
366 let test_dir = Sys.argv.(1) in
367 let files = Sys.readdir test_dir |> Array.to_list in
368 let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in
369
370 let total_passed = ref 0 in
371 let total_failed = ref 0 in
372 let file_results = ref [] in
373
374 List.iter (fun file ->
375 let path = Filename.concat test_dir file in
376 let (file_result, passed, failed) = run_file path in
377 total_passed := !total_passed + passed;
378 total_failed := !total_failed + failed;
379 file_results := file_result :: !file_results;
380 Printf.printf "%s: %d passed, %d failed\n" file passed failed
381 ) (List.sort String.compare test_files);
382
383 Printf.printf "\n=== Summary ===\n";
384 Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;
385
386 (* Generate HTML report *)
387 let report : Report.report = {
388 title = "HTML5 Tokenizer Tests";
389 test_type = "tokenizer";
390 description = "These tests validate the HTML5 tokenization algorithm as specified in the WHATWG HTML Standard. \
391 The tokenizer converts HTML input into a stream of tokens (DOCTYPE, start tags, end tags, comments, \
392 character data, and EOF). Each test specifies input HTML, expected tokens in JSON array format, \
393 and the initial tokenizer state. Tests cover normal parsing, RCDATA/RAWTEXT/PLAINTEXT states, \
394 script data parsing, CDATA sections, and various error conditions. Some tests are double-escaped \
395 to represent special characters. XML violation tests check behavior differences from XML mode.";
396 files = List.rev !file_results;
397 total_passed = !total_passed;
398 total_failed = !total_failed;
399 match_quality = None;
400 test_type_breakdown = None;
401 strictness_mode = None;
402 run_timestamp = None;
403 } in
404 Report.generate_report report "test_tokenizer_report.html";
405
406 exit (if !total_failed > 0 then 1 else 0)