OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Test runner for html5lib-tests tree construction tests *)
2
3open Bytesrw
4
5module Parser = Html5rw.Parser
6module Dom = Html5rw.Dom
7module Report = Test_report
8
9type test_case = {
10 input : string;
11 expected_tree : string;
12 expected_errors : string list;
13 script_on : bool;
14 fragment_context : string option;
15 raw_lines : string; (* Original test data from .dat file *)
16}
17
18let _is_blank s = String.trim s = ""
19
20(* Parse a single test case from lines *)
21let parse_test_case lines =
22 let raw_lines = String.concat "\n" lines in
23 let rec parse acc = function
24 | [] -> acc
25 | line :: rest when String.length line > 0 && line.[0] = '#' ->
26 let section = String.trim line in
27 let content, remaining = collect_section rest in
28 parse ((section, content) :: acc) remaining
29 | _ :: rest -> parse acc rest
30 and collect_section lines =
31 let rec loop acc = function
32 | [] -> (List.rev acc, [])
33 | line :: rest when String.length line > 0 && line.[0] = '#' ->
34 (List.rev acc, line :: rest)
35 | line :: rest -> loop (line :: acc) rest
36 in
37 loop [] lines
38 in
39 let sections = parse [] lines in
40
41 let get_section name =
42 match List.assoc_opt name sections with
43 | Some lines -> String.concat "\n" lines
44 | None -> ""
45 in
46
47 let data = get_section "#data" in
48 let document = get_section "#document" in
49 let errors_text = get_section "#errors" in
50 let errors =
51 String.split_on_char '\n' errors_text
52 |> List.filter (fun s -> String.trim s <> "")
53 in
54 let script_on = List.mem_assoc "#script-on" sections in
55 let fragment =
56 if List.mem_assoc "#document-fragment" sections then
57 Some (get_section "#document-fragment" |> String.trim)
58 else None
59 in
60
61 {
62 input = data;
63 expected_tree = document;
64 expected_errors = errors;
65 script_on;
66 fragment_context = fragment;
67 raw_lines;
68 }
69
70(* Parse a .dat file into test cases *)
71let parse_dat_file content =
72 let lines = String.split_on_char '\n' content in
73 (* Split on empty lines followed by #data *)
74 let rec split_tests current acc = function
75 | [] ->
76 if current = [] then List.rev acc
77 else List.rev (List.rev current :: acc)
78 | "" :: "#data" :: rest ->
79 (* End of current test, start new one *)
80 let new_acc = if current = [] then acc else (List.rev current :: acc) in
81 split_tests ["#data"] new_acc rest
82 | line :: rest ->
83 split_tests (line :: current) acc rest
84 in
85 let test_groups = split_tests [] [] lines in
86 List.filter_map (fun lines ->
87 if List.exists (fun l -> l = "#data") lines then
88 Some (parse_test_case lines)
89 else None
90 ) test_groups
91
92(* Strip "| " prefix from each line *)
93let strip_tree_prefix s =
94 let lines = String.split_on_char '\n' s in
95 let stripped = List.filter_map (fun line ->
96 if String.length line >= 2 && String.sub line 0 2 = "| " then
97 Some (String.sub line 2 (String.length line - 2))
98 else if String.trim line = "" then None
99 else Some line
100 ) lines in
101 String.concat "\n" stripped
102
103(* Normalize tree output for comparison *)
104let normalize_tree s =
105 let lines = String.split_on_char '\n' s in
106 let non_empty = List.filter (fun l -> String.trim l <> "") lines in
107 String.concat "\n" non_empty
108
109let run_test test =
110 try
111 let result =
112 match test.fragment_context with
113 | Some ctx_str ->
114 (* Parse "namespace element" or just "element" *)
115 let (namespace, tag_name) =
116 match String.split_on_char ' ' ctx_str with
117 | [ns; tag] when ns = "svg" -> (Some "svg", tag)
118 | [ns; tag] when ns = "math" -> (Some "mathml", tag)
119 | [tag] -> (None, tag)
120 | _ -> (None, ctx_str)
121 in
122 let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in
123 let reader = Bytes.Reader.of_string test.input in
124 Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader
125 | None ->
126 let reader = Bytes.Reader.of_string test.input in
127 Html5rw.Parser.parse ~collect_errors:true reader
128 in
129 let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in
130 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
131 let actual = normalize_tree (strip_tree_prefix actual_tree) in
132 let error_count = List.length (Html5rw.Parser.errors result) in
133 let expected_error_count = List.length test.expected_errors in
134 (expected = actual, expected, actual, error_count, expected_error_count)
135 with e ->
136 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
137 (false, expected, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), 0, 0)
138
139let run_file path =
140 let ic = open_in path in
141 let content = really_input_string ic (in_channel_length ic) in
142 close_in ic;
143
144 let tests = parse_dat_file content in
145 let filename = Filename.basename path in
146
147 let passed = ref 0 in
148 let failed = ref 0 in
149 let results = ref [] in
150
151 List.iteri (fun i test ->
152 (* Skip script-on tests since we don't support scripting *)
153 if test.script_on then
154 () (* Skip this test *)
155 else begin
156 let (success, expected, actual, actual_error_count, expected_error_count) = run_test test in
157 let description =
158 let input_preview = String.sub test.input 0 (min 60 (String.length test.input)) in
159 if test.fragment_context <> None then
160 Printf.sprintf "Fragment (%s): %s" (Option.get test.fragment_context) input_preview
161 else
162 input_preview
163 in
164 let details = [
165 ("Fragment Context", Option.value test.fragment_context ~default:"(none)");
166 ("Expected Errors", string_of_int expected_error_count);
167 ("Actual Errors", string_of_int actual_error_count);
168 ] in
169 let result : Report.test_result = {
170 test_num = i + 1;
171 description;
172 input = test.input;
173 expected;
174 actual;
175 success;
176 details;
177 raw_test_data = Some test.raw_lines;
178 } in
179 results := result :: !results;
180 if success then incr passed else incr failed
181 end
182 ) tests;
183
184 let file_result : Report.file_result = {
185 filename;
186 test_type = "Tree Construction";
187 passed_count = !passed;
188 failed_count = !failed;
189 tests = List.rev !results;
190 } in
191 (file_result, !passed, !failed)
192
193let () =
194 let test_dir = Sys.argv.(1) in
195 let files = Sys.readdir test_dir |> Array.to_list in
196 let dat_files = List.filter (fun f ->
197 Filename.check_suffix f ".dat" &&
198 not (String.contains f '/') (* Skip subdirectories *)
199 ) files in
200
201 let total_passed = ref 0 in
202 let total_failed = ref 0 in
203 let file_results = ref [] in
204
205 List.iter (fun file ->
206 let path = Filename.concat test_dir file in
207 if Sys.is_directory path then () else begin
208 let (file_result, passed, failed) = run_file path in
209 total_passed := !total_passed + passed;
210 total_failed := !total_failed + failed;
211 file_results := file_result :: !file_results;
212 Printf.printf "%s: %d passed, %d failed\n" file passed failed
213 end
214 ) (List.sort String.compare dat_files);
215
216 Printf.printf "\n=== Summary ===\n";
217 Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;
218
219 (* Generate HTML report *)
220 let report : Report.report = {
221 title = "HTML5 Tree Construction Tests";
222 test_type = "tree-construction";
223 description = "These tests validate the HTML5 tree construction algorithm as specified in the WHATWG HTML Standard. \
224 Each test provides HTML input and the expected DOM tree structure. The parser processes the HTML and \
225 builds a document tree, which is then serialized and compared against the expected output. \
226 Tests cover various edge cases including malformed HTML, implicit element creation, foster parenting, \
227 adoption agency algorithm, and foreign content (SVG/MathML). Fragment parsing tests verify parsing \
228 in the context of specific elements.";
229 files = List.rev !file_results;
230 total_passed = !total_passed;
231 total_failed = !total_failed;
232 match_quality = None;
233 test_type_breakdown = None;
234 strictness_mode = None;
235 run_timestamp = None;
236 } in
237 Report.generate_report report "test_html5lib_report.html";
238
239 exit (if !total_failed > 0 then 1 else 0)