OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Combined test runner for all html5lib-tests *)
2(* Generates a single standalone HTML report *)
3
4open Bytesrw
5
6module Report = Test_report
7
8(* ============================================================ *)
9(* Tree Construction Tests *)
10(* ============================================================ *)
11
12module TreeConstruction = struct
13 module Parser = Html5rw.Parser
14 module Dom = Html5rw.Dom
15
16 type test_case = {
17 input : string;
18 expected_tree : string;
19 expected_errors : string list;
20 script_on : bool;
21 fragment_context : string option;
22 raw_lines : string;
23 }
24
25 let parse_test_case lines =
26 let raw_lines = String.concat "\n" lines in
27 let rec parse acc = function
28 | [] -> acc
29 | line :: rest when String.length line > 0 && line.[0] = '#' ->
30 let section = String.trim line in
31 let content, remaining = collect_section rest in
32 parse ((section, content) :: acc) remaining
33 | _ :: rest -> parse acc rest
34 and collect_section lines =
35 let rec loop acc = function
36 | [] -> (List.rev acc, [])
37 | line :: rest when String.length line > 0 && line.[0] = '#' ->
38 (List.rev acc, line :: rest)
39 | line :: rest -> loop (line :: acc) rest
40 in
41 loop [] lines
42 in
43 let sections = parse [] lines in
44 let get_section name =
45 match List.assoc_opt name sections with
46 | Some lines -> String.concat "\n" lines
47 | None -> ""
48 in
49 let data = get_section "#data" in
50 let document = get_section "#document" in
51 let errors_text = get_section "#errors" in
52 let errors =
53 String.split_on_char '\n' errors_text
54 |> List.filter (fun s -> String.trim s <> "")
55 in
56 let script_on = List.mem_assoc "#script-on" sections in
57 let fragment =
58 if List.mem_assoc "#document-fragment" sections then
59 Some (get_section "#document-fragment" |> String.trim)
60 else None
61 in
62 { input = data; expected_tree = document; expected_errors = errors;
63 script_on; fragment_context = fragment; raw_lines }
64
65 let parse_dat_file content =
66 let lines = String.split_on_char '\n' content in
67 let rec split_tests current acc = function
68 | [] ->
69 if current = [] then List.rev acc
70 else List.rev (List.rev current :: acc)
71 | "" :: "#data" :: rest ->
72 let new_acc = if current = [] then acc else (List.rev current :: acc) in
73 split_tests ["#data"] new_acc rest
74 | line :: rest ->
75 split_tests (line :: current) acc rest
76 in
77 let test_groups = split_tests [] [] lines in
78 List.filter_map (fun lines ->
79 if List.exists (fun l -> l = "#data") lines then
80 Some (parse_test_case lines)
81 else None
82 ) test_groups
83
84 let strip_tree_prefix s =
85 let lines = String.split_on_char '\n' s in
86 let stripped = List.filter_map (fun line ->
87 if String.length line >= 2 && String.sub line 0 2 = "| " then
88 Some (String.sub line 2 (String.length line - 2))
89 else if String.trim line = "" then None
90 else Some line
91 ) lines in
92 String.concat "\n" stripped
93
94 let normalize_tree s =
95 let lines = String.split_on_char '\n' s in
96 let non_empty = List.filter (fun l -> String.trim l <> "") lines in
97 String.concat "\n" non_empty
98
99 let run_test test =
100 try
101 let result =
102 match test.fragment_context with
103 | Some ctx_str ->
104 let (namespace, tag_name) =
105 match String.split_on_char ' ' ctx_str with
106 | [ns; tag] when ns = "svg" -> (Some "svg", tag)
107 | [ns; tag] when ns = "math" -> (Some "mathml", tag)
108 | [tag] -> (None, tag)
109 | _ -> (None, ctx_str)
110 in
111 let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in
112 let reader = Bytes.Reader.of_string test.input in
113 Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader
114 | None ->
115 let reader = Bytes.Reader.of_string test.input in
116 Html5rw.Parser.parse ~collect_errors:true reader
117 in
118 let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in
119 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
120 let actual = normalize_tree (strip_tree_prefix actual_tree) in
121 let error_count = List.length (Html5rw.Parser.errors result) in
122 let expected_error_count = List.length test.expected_errors in
123 (expected = actual, expected, actual, error_count, expected_error_count)
124 with e ->
125 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
126 (false, expected, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), 0, 0)
127
128 let run_file path =
129 let ic = open_in path in
130 let content = really_input_string ic (in_channel_length ic) in
131 close_in ic;
132 let tests = parse_dat_file content in
133 let filename = Filename.basename path in
134 let passed = ref 0 in
135 let failed = ref 0 in
136 let results = ref [] in
137 List.iteri (fun i test ->
138 if test.script_on then ()
139 else begin
140 let (success, expected, actual, actual_error_count, expected_error_count) = run_test test in
141 let description =
142 let input_preview = String.sub test.input 0 (min 60 (String.length test.input)) in
143 if test.fragment_context <> None then
144 Printf.sprintf "Fragment (%s): %s" (Option.get test.fragment_context) input_preview
145 else input_preview
146 in
147 let result : Report.test_result = {
148 test_num = i + 1; description; input = test.input; expected; actual; success;
149 details = [
150 ("Fragment Context", Option.value test.fragment_context ~default:"(none)");
151 ("Expected Errors", string_of_int expected_error_count);
152 ("Actual Errors", string_of_int actual_error_count);
153 ];
154 raw_test_data = Some test.raw_lines;
155 } in
156 results := result :: !results;
157 if success then incr passed else incr failed
158 end
159 ) tests;
160 let file_result : Report.file_result = {
161 filename; test_type = "Tree Construction";
162 passed_count = !passed; failed_count = !failed;
163 tests = List.rev !results;
164 } in
165 (file_result, !passed, !failed)
166
167 let run_dir test_dir =
168 let files = Sys.readdir test_dir |> Array.to_list in
169 let dat_files = List.filter (fun f ->
170 Filename.check_suffix f ".dat" && not (String.contains f '/')
171 ) files in
172 let total_passed = ref 0 in
173 let total_failed = ref 0 in
174 let file_results = ref [] in
175 List.iter (fun file ->
176 let path = Filename.concat test_dir file in
177 if Sys.is_directory path then () else begin
178 let (file_result, passed, failed) = run_file path in
179 total_passed := !total_passed + passed;
180 total_failed := !total_failed + failed;
181 file_results := file_result :: !file_results;
182 Printf.printf " %s: %d passed, %d failed\n" file passed failed
183 end
184 ) (List.sort String.compare dat_files);
185 (List.rev !file_results, !total_passed, !total_failed)
186end
187
188(* ============================================================ *)
189(* Tokenizer Tests *)
190(* ============================================================ *)
191
192module Tokenizer_tests = struct
193 module Tokenizer = Html5rw.Tokenizer
194
195 module TokenCollector = struct
196 type t = { mutable tokens : Html5rw.Tokenizer.Token.t list }
197 let create () = { tokens = [] }
198 let process t token ~line:_ ~column:_ = t.tokens <- token :: t.tokens; `Continue
199 let adjusted_current_node_in_html_namespace _ = true
200 let get_tokens t = List.rev t.tokens
201 end
202
203 type test_case = {
204 description : string;
205 input : string;
206 output : Jsont.json list;
207 expected_error_count : int;
208 initial_states : string list;
209 last_start_tag : string option;
210 double_escaped : bool;
211 xml_mode : bool;
212 raw_json : string;
213 }
214
215 let unescape_double s =
216 let b = Buffer.create (String.length s) in
217 let i = ref 0 in
218 while !i < String.length s do
219 if !i + 1 < String.length s && s.[!i] = '\\' then begin
220 match s.[!i + 1] with
221 | 'u' when !i + 5 < String.length s ->
222 let hex = String.sub s (!i + 2) 4 in
223 (try
224 let code = int_of_string ("0x" ^ hex) in
225 if code < 128 then Buffer.add_char b (Char.chr code)
226 else begin
227 if code < 0x800 then begin
228 Buffer.add_char b (Char.chr (0xC0 lor (code lsr 6)));
229 Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F)))
230 end else begin
231 Buffer.add_char b (Char.chr (0xE0 lor (code lsr 12)));
232 Buffer.add_char b (Char.chr (0x80 lor ((code lsr 6) land 0x3F)));
233 Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F)))
234 end
235 end;
236 i := !i + 6
237 with _ -> Buffer.add_char b s.[!i]; incr i)
238 | 'n' -> Buffer.add_char b '\n'; i := !i + 2
239 | 'r' -> Buffer.add_char b '\r'; i := !i + 2
240 | 't' -> Buffer.add_char b '\t'; i := !i + 2
241 | '\\' -> Buffer.add_char b '\\'; i := !i + 2
242 | _ -> Buffer.add_char b s.[!i]; incr i
243 end else begin
244 Buffer.add_char b s.[!i]; incr i
245 end
246 done;
247 Buffer.contents b
248
249 let json_string = function Jsont.String (s, _) -> s | _ -> failwith "Expected string"
250 let json_bool = function Jsont.Bool (b, _) -> b | _ -> failwith "Expected bool"
251 let json_array = function Jsont.Array (arr, _) -> arr | _ -> failwith "Expected array"
252 let json_object = function Jsont.Object (obj, _) -> obj | _ -> failwith "Expected object"
253
254 let json_mem name obj =
255 match List.find_opt (fun ((n, _), _) -> n = name) obj with
256 | Some (_, v) -> Some v | None -> None
257
258 let json_mem_exn name obj =
259 match json_mem name obj with Some v -> v | None -> failwith ("Missing member: " ^ name)
260
261 let rec json_to_string = function
262 | Jsont.Null _ -> "null"
263 | Jsont.Bool (b, _) -> string_of_bool b
264 | Jsont.Number (n, _) -> Printf.sprintf "%g" n
265 | Jsont.String (s, _) -> Printf.sprintf "%S" s
266 | Jsont.Array (arr, _) -> "[" ^ String.concat ", " (List.map json_to_string arr) ^ "]"
267 | Jsont.Object (obj, _) ->
268 "{" ^ String.concat ", " (List.map (fun ((n, _), v) -> Printf.sprintf "%S: %s" n (json_to_string v)) obj) ^ "}"
269
270 let parse_test_case ~xml_mode json =
271 let raw_json = json_to_string json in
272 let obj = json_object json in
273 let description = json_string (json_mem_exn "description" obj) in
274 let input = json_string (json_mem_exn "input" obj) in
275 let output = json_array (json_mem_exn "output" obj) in
276 let expected_error_count = match json_mem "errors" obj with
277 | Some e -> List.length (json_array e) | None -> 0
278 in
279 let initial_states = match json_mem "initialStates" obj with
280 | Some s -> List.map json_string (json_array s) | None -> ["Data state"]
281 in
282 let last_start_tag = match json_mem "lastStartTag" obj with
283 | Some s -> Some (json_string s) | None -> None
284 in
285 let double_escaped = match json_mem "doubleEscaped" obj with
286 | Some b -> json_bool b | None -> false
287 in
288 { description; input; output; expected_error_count; initial_states;
289 last_start_tag; double_escaped; xml_mode; raw_json }
290
291 let state_of_string = function
292 | "Data state" -> Html5rw.Tokenizer.State.Data
293 | "PLAINTEXT state" -> Html5rw.Tokenizer.State.Plaintext
294 | "RCDATA state" -> Html5rw.Tokenizer.State.Rcdata
295 | "RAWTEXT state" -> Html5rw.Tokenizer.State.Rawtext
296 | "Script data state" -> Html5rw.Tokenizer.State.Script_data
297 | "CDATA section state" -> Html5rw.Tokenizer.State.Cdata_section
298 | s -> failwith ("Unknown state: " ^ s)
299
300 let token_to_test_json (tok : Html5rw.Tokenizer.Token.t) : Jsont.json list =
301 let str s = Jsont.String (s, Jsont.Meta.none) in
302 let arr l = Jsont.Array (l, Jsont.Meta.none) in
303 match tok with
304 | Html5rw.Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } ->
305 let name_json = match name with Some n -> str n | None -> Jsont.Null ((), Jsont.Meta.none) in
306 let public_json = match public_id with Some p -> str p | None -> Jsont.Null ((), Jsont.Meta.none) in
307 let system_json = match system_id with Some s -> str s | None -> Jsont.Null ((), Jsont.Meta.none) in
308 let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in
309 [arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]]
310 | Html5rw.Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } ->
311 let attrs_obj = Jsont.Object (
312 List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs),
313 Jsont.Meta.none
314 ) in
315 if self_closing then [arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]]
316 else [arr [str "StartTag"; str name; attrs_obj]]
317 | Html5rw.Tokenizer.Token.Tag { kind = End; name; _ } -> [arr [str "EndTag"; str name]]
318 | Html5rw.Tokenizer.Token.Comment data -> [arr [str "Comment"; str data]]
319 | Html5rw.Tokenizer.Token.Character data -> [arr [str "Character"; str data]]
320 | Html5rw.Tokenizer.Token.EOF -> []
321
322 let rec json_equal a b =
323 match a, b with
324 | Jsont.Null _, Jsont.Null _ -> true
325 | Jsont.Bool (a, _), Jsont.Bool (b, _) -> a = b
326 | Jsont.Number (a, _), Jsont.Number (b, _) -> a = b
327 | Jsont.String (a, _), Jsont.String (b, _) -> a = b
328 | Jsont.Array (a, _), Jsont.Array (b, _) ->
329 List.length a = List.length b && List.for_all2 json_equal a b
330 | Jsont.Object (a, _), Jsont.Object (b, _) ->
331 let a_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) a in
332 let b_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) b in
333 List.length a_sorted = List.length b_sorted &&
334 List.for_all2 (fun ((n1, _), v1) ((n2, _), v2) -> n1 = n2 && json_equal v1 v2) a_sorted b_sorted
335 | _ -> false
336
337 let merge_character_tokens tokens =
338 let rec loop acc = function
339 | [] -> List.rev acc
340 | Html5rw.Tokenizer.Token.Character s1 :: Html5rw.Tokenizer.Token.Character s2 :: rest ->
341 loop acc (Html5rw.Tokenizer.Token.Character (s1 ^ s2) :: rest)
342 | tok :: rest -> loop (tok :: acc) rest
343 in loop [] tokens
344
345 let run_test test initial_state =
346 let input = if test.double_escaped then unescape_double test.input else test.input in
347 let collector = TokenCollector.create () in
348 let tokenizer = Html5rw.Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in
349 Html5rw.Tokenizer.set_state tokenizer initial_state;
350 (match test.last_start_tag with Some tag -> Html5rw.Tokenizer.set_last_start_tag tokenizer tag | None -> ());
351 let reader = Bytes.Reader.of_string input in
352 Html5rw.Tokenizer.run tokenizer (module TokenCollector) reader;
353 let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in
354 let actual_tokens = List.concat_map token_to_test_json tokens in
355 let expected_output = if test.double_escaped then
356 let rec unescape_json = function
357 | Jsont.String (s, m) -> Jsont.String (unescape_double s, m)
358 | Jsont.Array (arr, m) -> Jsont.Array (List.map unescape_json arr, m)
359 | Jsont.Object (obj, m) -> Jsont.Object (List.map (fun (n, v) -> (n, unescape_json v)) obj, m)
360 | other -> other
361 in List.map unescape_json test.output
362 else test.output in
363 let rec merge_expected = function
364 | [] -> []
365 | [x] -> [x]
366 | Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s1, m1)], am1) ::
367 Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s2, _)], _) :: rest ->
368 merge_expected (Jsont.Array ([Jsont.String ("Character", Jsont.Meta.none); Jsont.String (s1 ^ s2, m1)], am1) :: rest)
369 | x :: rest -> x :: merge_expected rest
370 in
371 let expected = merge_expected expected_output in
372 let tokens_match =
373 List.length actual_tokens = List.length expected &&
374 List.for_all2 json_equal actual_tokens expected
375 in
376 let actual_error_count = List.length (Html5rw.Tokenizer.get_errors tokenizer) in
377 let errors_count_match = actual_error_count = test.expected_error_count in
378 (tokens_match && errors_count_match, actual_tokens, expected, actual_error_count, test.expected_error_count)
379
380 let run_file path =
381 let content =
382 let ic = open_in path in
383 let n = in_channel_length ic in
384 let s = really_input_string ic n in
385 close_in ic; s
386 in
387 let json = match Jsont_bytesrw.decode_string Jsont.json content with
388 | Ok j -> j
389 | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e)
390 in
391 let obj = json_object json in
392 let regular_tests = match json_mem "tests" obj with
393 | Some t -> List.map (parse_test_case ~xml_mode:false) (json_array t) | None -> []
394 in
395 let xml_tests = match json_mem "xmlViolationTests" obj with
396 | Some t -> List.map (parse_test_case ~xml_mode:true) (json_array t) | None -> []
397 in
398 let all_tests = regular_tests @ xml_tests in
399 let filename = Filename.basename path in
400 let passed = ref 0 in
401 let failed = ref 0 in
402 let results = ref [] in
403 List.iteri (fun i test ->
404 List.iter (fun state_name ->
405 try
406 let state = state_of_string state_name in
407 let (success, actual, expected, actual_err_count, expected_err_count) = run_test test state in
408 let description = Printf.sprintf "[%s] %s" state_name test.description in
409 let result : Report.test_result = {
410 test_num = i + 1; description; input = test.input;
411 expected = String.concat "\n" (List.map json_to_string expected);
412 actual = String.concat "\n" (List.map json_to_string actual);
413 success;
414 details = [
415 ("Initial State", state_name);
416 ("Last Start Tag", Option.value test.last_start_tag ~default:"(none)");
417 ("Double Escaped", string_of_bool test.double_escaped);
418 ("XML Mode", string_of_bool test.xml_mode);
419 ("Expected Errors", string_of_int expected_err_count);
420 ("Actual Errors", string_of_int actual_err_count);
421 ];
422 raw_test_data = Some test.raw_json;
423 } in
424 results := result :: !results;
425 if success then incr passed else incr failed
426 with e ->
427 incr failed;
428 let result : Report.test_result = {
429 test_num = i + 1;
430 description = Printf.sprintf "[%s] %s" state_name test.description;
431 input = test.input; expected = "";
432 actual = Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e);
433 success = false; details = [];
434 raw_test_data = Some test.raw_json;
435 } in
436 results := result :: !results
437 ) test.initial_states
438 ) all_tests;
439 let file_result : Report.file_result = {
440 filename; test_type = "Tokenizer";
441 passed_count = !passed; failed_count = !failed;
442 tests = List.rev !results;
443 } in
444 (file_result, !passed, !failed)
445
446 let run_dir test_dir =
447 let files = Sys.readdir test_dir |> Array.to_list in
448 let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in
449 let total_passed = ref 0 in
450 let total_failed = ref 0 in
451 let file_results = ref [] in
452 List.iter (fun file ->
453 let path = Filename.concat test_dir file in
454 let (file_result, passed, failed) = run_file path in
455 total_passed := !total_passed + passed;
456 total_failed := !total_failed + failed;
457 file_results := file_result :: !file_results;
458 Printf.printf " %s: %d passed, %d failed\n" file passed failed
459 ) (List.sort String.compare test_files);
460 (List.rev !file_results, !total_passed, !total_failed)
461end
462
463(* ============================================================ *)
464(* Encoding Tests *)
465(* ============================================================ *)
466
467module Encoding_tests = struct
468 module Encoding = Html5rw.Encoding
469
470 type test_case = {
471 input : string;
472 expected_encoding : string;
473 raw_lines : string;
474 }
475
476 let normalize_encoding_name s = String.lowercase_ascii (String.trim s)
477
478 let encoding_to_test_name = function
479 | Html5rw.Encoding.Utf8 -> "utf-8"
480 | Html5rw.Encoding.Utf16le -> "utf-16le"
481 | Html5rw.Encoding.Utf16be -> "utf-16be"
482 | Html5rw.Encoding.Windows_1252 -> "windows-1252"
483 | Html5rw.Encoding.Iso_8859_2 -> "iso-8859-2"
484 | Html5rw.Encoding.Euc_jp -> "euc-jp"
485
486 let parse_test_case lines =
487 let raw_lines = String.concat "\n" lines in
488 let rec parse acc = function
489 | [] -> acc
490 | line :: rest when String.length line > 0 && line.[0] = '#' ->
491 let section = String.trim line in
492 let content, remaining = collect_section rest in
493 parse ((section, content) :: acc) remaining
494 | _ :: rest -> parse acc rest
495 and collect_section lines =
496 let rec loop acc = function
497 | [] -> (List.rev acc, [])
498 | line :: rest when String.length line > 0 && line.[0] = '#' ->
499 (List.rev acc, line :: rest)
500 | line :: rest -> loop (line :: acc) rest
501 in loop [] lines
502 in
503 let sections = parse [] lines in
504 let get_section name =
505 match List.assoc_opt name sections with
506 | Some lines -> String.concat "\n" lines | None -> ""
507 in
508 let data = get_section "#data" in
509 let encoding = get_section "#encoding" in
510 { input = data; expected_encoding = String.trim encoding; raw_lines }
511
512 let parse_dat_file content =
513 let lines = String.split_on_char '\n' content in
514 let rec split_tests current acc = function
515 | [] -> if current = [] then List.rev acc else List.rev (List.rev current :: acc)
516 | "" :: "#data" :: rest ->
517 let new_acc = if current = [] then acc else (List.rev current :: acc) in
518 split_tests ["#data"] new_acc rest
519 | line :: rest -> split_tests (line :: current) acc rest
520 in
521 let test_groups = split_tests [] [] lines in
522 List.filter_map (fun lines ->
523 if List.exists (fun l -> l = "#data") lines then Some (parse_test_case lines)
524 else None
525 ) test_groups
526
527 let run_test test =
528 try
529 let (_, detected_encoding) = Html5rw.Encoding.decode (Bytes.of_string test.input) () in
530 let detected_name = encoding_to_test_name detected_encoding in
531 let expected_name = normalize_encoding_name test.expected_encoding in
532 let match_encoding det exp =
533 det = exp ||
534 (det = "windows-1252" && (exp = "windows-1252" || exp = "cp1252" || exp = "iso-8859-1")) ||
535 (det = "iso-8859-2" && (exp = "iso-8859-2" || exp = "iso8859-2" || exp = "latin2")) ||
536 (det = "utf-8" && (exp = "utf-8" || exp = "utf8")) ||
537 (det = "euc-jp" && (exp = "euc-jp" || exp = "eucjp"))
538 in
539 (match_encoding detected_name expected_name, detected_name, expected_name)
540 with e ->
541 (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected_encoding)
542
543 let run_file path =
544 let ic = open_in path in
545 let content = really_input_string ic (in_channel_length ic) in
546 close_in ic;
547 let tests = parse_dat_file content in
548 let filename = Filename.basename path in
549 let passed = ref 0 in
550 let failed = ref 0 in
551 let results = ref [] in
552 List.iteri (fun i test ->
553 if String.trim test.expected_encoding = "" then ()
554 else begin
555 let (success, detected, expected) = run_test test in
556 let result : Report.test_result = {
557 test_num = i + 1;
558 description = Printf.sprintf "Detect %s encoding" expected;
559 input = String.escaped test.input;
560 expected; actual = detected; success;
561 details = [
562 ("Input Length", string_of_int (String.length test.input));
563 ("Has BOM", string_of_bool (String.length test.input >= 3 &&
564 (String.sub test.input 0 3 = "\xEF\xBB\xBF" ||
565 String.sub test.input 0 2 = "\xFF\xFE" ||
566 String.sub test.input 0 2 = "\xFE\xFF")));
567 ];
568 raw_test_data = Some test.raw_lines;
569 } in
570 results := result :: !results;
571 if success then incr passed else incr failed
572 end
573 ) tests;
574 let file_result : Report.file_result = {
575 filename; test_type = "Encoding Detection";
576 passed_count = !passed; failed_count = !failed;
577 tests = List.rev !results;
578 } in
579 (file_result, !passed, !failed)
580
581 let run_dir test_dir =
582 let files = Sys.readdir test_dir |> Array.to_list in
583 let dat_files = List.filter (fun f ->
584 Filename.check_suffix f ".dat" && not (String.contains f '/')
585 ) files in
586 let total_passed = ref 0 in
587 let total_failed = ref 0 in
588 let file_results = ref [] in
589 List.iter (fun file ->
590 let path = Filename.concat test_dir file in
591 if Sys.is_directory path then () else begin
592 let (file_result, passed, failed) = run_file path in
593 total_passed := !total_passed + passed;
594 total_failed := !total_failed + failed;
595 file_results := file_result :: !file_results;
596 Printf.printf " %s: %d passed, %d failed\n" file passed failed
597 end
598 ) (List.sort String.compare dat_files);
599 (List.rev !file_results, !total_passed, !total_failed)
600end
601
602(* ============================================================ *)
603(* Main Entry Point *)
604(* ============================================================ *)
605
606let () =
607 let base_dir = if Array.length Sys.argv > 1 then Sys.argv.(1) else "html5lib-tests" in
608
609 let all_files = ref [] in
610 let total_passed = ref 0 in
611 let total_failed = ref 0 in
612
613 (* Run Tree Construction Tests *)
614 Printf.printf "\n=== Tree Construction Tests ===\n";
615 let tree_dir = Filename.concat base_dir "tree-construction" in
616 if Sys.file_exists tree_dir then begin
617 let (files, passed, failed) = TreeConstruction.run_dir tree_dir in
618 all_files := !all_files @ files;
619 total_passed := !total_passed + passed;
620 total_failed := !total_failed + failed;
621 Printf.printf " Subtotal: %d passed, %d failed\n" passed failed
622 end else
623 Printf.printf " (directory not found: %s)\n" tree_dir;
624
625 (* Run Tokenizer Tests *)
626 Printf.printf "\n=== Tokenizer Tests ===\n";
627 let tok_dir = Filename.concat base_dir "tokenizer" in
628 if Sys.file_exists tok_dir then begin
629 let (files, passed, failed) = Tokenizer_tests.run_dir tok_dir in
630 all_files := !all_files @ files;
631 total_passed := !total_passed + passed;
632 total_failed := !total_failed + failed;
633 Printf.printf " Subtotal: %d passed, %d failed\n" passed failed
634 end else
635 Printf.printf " (directory not found: %s)\n" tok_dir;
636
637 (* Run Encoding Tests *)
638 Printf.printf "\n=== Encoding Detection Tests ===\n";
639 let enc_dir = Filename.concat base_dir "encoding" in
640 if Sys.file_exists enc_dir then begin
641 let (files, passed, failed) = Encoding_tests.run_dir enc_dir in
642 all_files := !all_files @ files;
643 total_passed := !total_passed + passed;
644 total_failed := !total_failed + failed;
645 Printf.printf " Subtotal: %d passed, %d failed\n" passed failed
646 end else
647 Printf.printf " (directory not found: %s)\n" enc_dir;
648
649 (* Note: Serializer tests use the standalone test_serializer.exe for full implementation *)
650
651 Printf.printf "\n=== Overall Summary ===\n";
652 Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;
653
654 (* Generate combined HTML report *)
655 let report : Report.report = {
656 title = "HTML5 Parser Test Suite";
657 test_type = "combined";
658 description = "This is a comprehensive test report for the html5rw OCaml HTML5 parser library, \
659 validating conformance against the official html5lib-tests test suite. \
660 Tests cover: (1) Tree Construction - validating the DOM tree building algorithm; \
661 (2) Tokenization - validating the HTML tokenizer state machine; \
662 (3) Encoding Detection - validating character encoding sniffing. \
663 Each test shows the input, expected output, actual output, and original test data.";
664 files = !all_files;
665 total_passed = !total_passed;
666 total_failed = !total_failed;
667 } in
668 Report.generate_report report "html5lib_test_report.html";
669
670 exit (if !total_failed > 0 then 1 else 0)