OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(** Browser-compatible test runner for html5rw regression tests.
7
8 This module provides functions to run html5lib conformance tests
9 in the browser, receiving test data as strings and returning results
10 as JavaScript-accessible objects. *)
11
12[@@@warning "-69-33"] (* Silence unused-field and unused-open warnings *)
13
14open Brr
15
16(* ============================================================ *)
17(* Test Result Types *)
18(* ============================================================ *)
19
20type test_result = {
21 test_num : int;
22 description : string;
23 input : string;
24 expected : string;
25 actual : string;
26 success : bool;
27}
28
29type file_result = {
30 filename : string;
31 test_type : string;
32 passed_count : int;
33 failed_count : int;
34 tests : test_result list;
35}
36
37type suite_result = {
38 name : string;
39 total_passed : int;
40 total_failed : int;
41 files : file_result list;
42}
43
44(* ============================================================ *)
45(* Tree Construction Tests *)
46(* ============================================================ *)
47
48module TreeConstruction = struct
49 type test_case = {
50 input : string;
51 expected_tree : string;
52 expected_errors : string list;
53 script_on : bool;
54 fragment_context : string option;
55 }
56
57 let parse_test_case lines =
58 let rec parse acc = function
59 | [] -> acc
60 | line :: rest when String.length line > 0 && line.[0] = '#' ->
61 let section = String.trim line in
62 let content, remaining = collect_section rest in
63 parse ((section, content) :: acc) remaining
64 | _ :: rest -> parse acc rest
65 and collect_section lines =
66 let rec loop acc = function
67 | [] -> (List.rev acc, [])
68 | line :: rest when String.length line > 0 && line.[0] = '#' ->
69 (List.rev acc, line :: rest)
70 | line :: rest -> loop (line :: acc) rest
71 in
72 loop [] lines
73 in
74 let sections = parse [] lines in
75 let get_section name =
76 match List.assoc_opt name sections with
77 | Some lines -> String.concat "\n" lines
78 | None -> ""
79 in
80 let data = get_section "#data" in
81 let document = get_section "#document" in
82 let errors_text = get_section "#errors" in
83 let errors =
84 String.split_on_char '\n' errors_text
85 |> List.filter (fun s -> String.trim s <> "")
86 in
87 let script_on = List.mem_assoc "#script-on" sections in
88 let fragment =
89 if List.mem_assoc "#document-fragment" sections then
90 Some (get_section "#document-fragment" |> String.trim)
91 else None
92 in
93 { input = data; expected_tree = document; expected_errors = errors;
94 script_on; fragment_context = fragment }
95
96 let parse_dat_content content =
97 let lines = String.split_on_char '\n' content in
98 let rec split_tests current acc = function
99 | [] ->
100 if current = [] then List.rev acc
101 else List.rev (List.rev current :: acc)
102 | "" :: "#data" :: rest ->
103 let new_acc = if current = [] then acc else (List.rev current :: acc) in
104 split_tests ["#data"] new_acc rest
105 | line :: rest ->
106 split_tests (line :: current) acc rest
107 in
108 let test_groups = split_tests [] [] lines in
109 List.filter_map (fun lines ->
110 if List.exists (fun l -> l = "#data") lines then
111 Some (parse_test_case lines)
112 else None
113 ) test_groups
114
115 let strip_tree_prefix s =
116 let lines = String.split_on_char '\n' s in
117 let stripped = List.filter_map (fun line ->
118 if String.length line >= 2 && String.sub line 0 2 = "| " then
119 Some (String.sub line 2 (String.length line - 2))
120 else if String.trim line = "" then None
121 else Some line
122 ) lines in
123 String.concat "\n" stripped
124
125 let normalize_tree s =
126 let lines = String.split_on_char '\n' s in
127 let non_empty = List.filter (fun l -> String.trim l <> "") lines in
128 String.concat "\n" non_empty
129
130 let run_test test =
131 try
132 let result =
133 match test.fragment_context with
134 | Some ctx_str ->
135 let (namespace, tag_name) =
136 match String.split_on_char ' ' ctx_str with
137 | [ns; tag] when ns = "svg" -> (Some "svg", tag)
138 | [ns; tag] when ns = "math" -> (Some "mathml", tag)
139 | [tag] -> (None, tag)
140 | _ -> (None, ctx_str)
141 in
142 let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in
143 let reader = Bytesrw.Bytes.Reader.of_string test.input in
144 Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader
145 | None ->
146 let reader = Bytesrw.Bytes.Reader.of_string test.input in
147 Html5rw.Parser.parse ~collect_errors:true reader
148 in
149 let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in
150 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
151 let actual = normalize_tree (strip_tree_prefix actual_tree) in
152 (expected = actual, expected, actual)
153 with e ->
154 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
155 (false, expected, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e))
156
157 let run_content ~filename content =
158 let tests = parse_dat_content content in
159 let passed = ref 0 in
160 let failed = ref 0 in
161 let results = ref [] in
162 List.iteri (fun i test ->
163 if test.script_on then ()
164 else begin
165 let (success, expected, actual) = run_test test in
166 let description =
167 let input_preview = String.sub test.input 0 (min 60 (String.length test.input)) in
168 if test.fragment_context <> None then
169 Printf.sprintf "Fragment (%s): %s" (Option.get test.fragment_context) input_preview
170 else input_preview
171 in
172 let result = {
173 test_num = i + 1; description; input = test.input; expected; actual; success;
174 } in
175 results := result :: !results;
176 if success then incr passed else incr failed
177 end
178 ) tests;
179 {
180 filename; test_type = "Tree Construction";
181 passed_count = !passed; failed_count = !failed;
182 tests = List.rev !results;
183 }
184end
185
186(* ============================================================ *)
187(* Encoding Tests *)
188(* ============================================================ *)
189
190module EncodingTests = struct
191 type test_case = {
192 input : string;
193 expected_encoding : string;
194 }
195
196 let normalize_encoding_name s = String.lowercase_ascii (String.trim s)
197
198 let encoding_to_test_name = function
199 | Html5rw.Encoding.Utf8 -> "utf-8"
200 | Html5rw.Encoding.Utf16le -> "utf-16le"
201 | Html5rw.Encoding.Utf16be -> "utf-16be"
202 | Html5rw.Encoding.Windows_1252 -> "windows-1252"
203 | Html5rw.Encoding.Iso_8859_2 -> "iso-8859-2"
204 | Html5rw.Encoding.Euc_jp -> "euc-jp"
205
206 let parse_test_case lines =
207 let rec parse acc = function
208 | [] -> acc
209 | line :: rest when String.length line > 0 && line.[0] = '#' ->
210 let section = String.trim line in
211 let content, remaining = collect_section rest in
212 parse ((section, content) :: acc) remaining
213 | _ :: rest -> parse acc rest
214 and collect_section lines =
215 let rec loop acc = function
216 | [] -> (List.rev acc, [])
217 | line :: rest when String.length line > 0 && line.[0] = '#' ->
218 (List.rev acc, line :: rest)
219 | line :: rest -> loop (line :: acc) rest
220 in loop [] lines
221 in
222 let sections = parse [] lines in
223 let get_section name =
224 match List.assoc_opt name sections with
225 | Some lines -> String.concat "\n" lines | None -> ""
226 in
227 let data = get_section "#data" in
228 let encoding = get_section "#encoding" in
229 { input = data; expected_encoding = String.trim encoding }
230
231 let parse_dat_content content =
232 let lines = String.split_on_char '\n' content in
233 let rec split_tests current acc = function
234 | [] -> if current = [] then List.rev acc else List.rev (List.rev current :: acc)
235 | "" :: "#data" :: rest ->
236 let new_acc = if current = [] then acc else (List.rev current :: acc) in
237 split_tests ["#data"] new_acc rest
238 | line :: rest -> split_tests (line :: current) acc rest
239 in
240 let test_groups = split_tests [] [] lines in
241 List.filter_map (fun lines ->
242 if List.exists (fun l -> l = "#data") lines then Some (parse_test_case lines)
243 else None
244 ) test_groups
245
246 let run_test test =
247 try
248 let (_, detected_encoding) = Html5rw.Encoding.decode (Bytes.of_string test.input) () in
249 let detected_name = encoding_to_test_name detected_encoding in
250 let expected_name = normalize_encoding_name test.expected_encoding in
251 let match_encoding det exp =
252 det = exp ||
253 (det = "windows-1252" && (exp = "windows-1252" || exp = "cp1252" || exp = "iso-8859-1")) ||
254 (det = "iso-8859-2" && (exp = "iso-8859-2" || exp = "iso8859-2" || exp = "latin2")) ||
255 (det = "utf-8" && (exp = "utf-8" || exp = "utf8")) ||
256 (det = "euc-jp" && (exp = "euc-jp" || exp = "eucjp"))
257 in
258 (match_encoding detected_name expected_name, detected_name, expected_name)
259 with e ->
260 (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected_encoding)
261
262 let run_content ~filename content =
263 let tests = parse_dat_content content in
264 let passed = ref 0 in
265 let failed = ref 0 in
266 let results = ref [] in
267 List.iteri (fun i test ->
268 if String.trim test.expected_encoding = "" then ()
269 else begin
270 let (success, detected, expected) = run_test test in
271 let result = {
272 test_num = i + 1;
273 description = Printf.sprintf "Detect %s encoding" expected;
274 input = String.escaped test.input;
275 expected; actual = detected; success;
276 } in
277 results := result :: !results;
278 if success then incr passed else incr failed
279 end
280 ) tests;
281 {
282 filename; test_type = "Encoding Detection";
283 passed_count = !passed; failed_count = !failed;
284 tests = List.rev !results;
285 }
286end
287
288(* ============================================================ *)
289(* JavaScript API *)
290(* ============================================================ *)
291
292let test_result_to_jv (r : test_result) =
293 Jv.obj [|
294 "testNum", Jv.of_int r.test_num;
295 "description", Jv.of_string r.description;
296 "input", Jv.of_string r.input;
297 "expected", Jv.of_string r.expected;
298 "actual", Jv.of_string r.actual;
299 "success", Jv.of_bool r.success;
300 |]
301
302let file_result_to_jv (r : file_result) =
303 Jv.obj [|
304 "filename", Jv.of_string r.filename;
305 "testType", Jv.of_string r.test_type;
306 "passedCount", Jv.of_int r.passed_count;
307 "failedCount", Jv.of_int r.failed_count;
308 "tests", Jv.of_list test_result_to_jv r.tests;
309 |]
310
311let suite_result_to_jv (r : suite_result) =
312 Jv.obj [|
313 "name", Jv.of_string r.name;
314 "totalPassed", Jv.of_int r.total_passed;
315 "totalFailed", Jv.of_int r.total_failed;
316 "files", Jv.of_list file_result_to_jv r.files;
317 |]
318
319(** Run tree construction tests on a single file's content *)
320let run_tree_construction_test filename content =
321 let result = TreeConstruction.run_content ~filename content in
322 file_result_to_jv result
323
324(** Run encoding detection tests on a single file's content *)
325let run_encoding_test filename content =
326 let result = EncodingTests.run_content ~filename content in
327 file_result_to_jv result
328
329(** Run all tests from provided test data *)
330let run_all_tests (test_files : (string * string * string) list) =
331 let tree_files = ref [] in
332 let encoding_files = ref [] in
333 let total_passed = ref 0 in
334 let total_failed = ref 0 in
335
336 List.iter (fun (test_type, filename, content) ->
337 let result = match test_type with
338 | "tree-construction" ->
339 let r = TreeConstruction.run_content ~filename content in
340 tree_files := r :: !tree_files;
341 r
342 | "encoding" ->
343 let r = EncodingTests.run_content ~filename content in
344 encoding_files := r :: !encoding_files;
345 r
346 | _ -> failwith ("Unknown test type: " ^ test_type)
347 in
348 total_passed := !total_passed + result.passed_count;
349 total_failed := !total_failed + result.failed_count
350 ) test_files;
351
352 let all_files = List.rev !tree_files @ List.rev !encoding_files in
353 let suite = {
354 name = "HTML5lib Regression Tests";
355 total_passed = !total_passed;
356 total_failed = !total_failed;
357 files = all_files;
358 } in
359 suite_result_to_jv suite
360
361(* ============================================================ *)
362(* Simple Parser Test for Quick Validation *)
363(* ============================================================ *)
364
365let quick_parse_test html =
366 try
367 let reader = Bytesrw.Bytes.Reader.of_string html in
368 let result = Html5rw.Parser.parse ~collect_errors:true reader in
369 let root = Html5rw.Parser.root result in
370 let serialized = Html5rw.Dom.to_html root in
371 let errors = Html5rw.Parser.errors result in
372 let error_to_string e = Format.asprintf "%a" Html5rw.pp_parse_error e in
373 Jv.obj [|
374 "success", Jv.of_bool true;
375 "html", Jv.of_string serialized;
376 "errorCount", Jv.of_int (List.length errors);
377 "errors", Jv.of_list (fun e -> Jv.of_string (error_to_string e)) errors;
378 |]
379 with e ->
380 Jv.obj [|
381 "success", Jv.of_bool false;
382 "error", Jv.of_string (Printexc.to_string e);
383 |]
384
385(* ============================================================ *)
386(* Export to JavaScript *)
387(* ============================================================ *)
388
389let () =
390 let html5rw_tests = Jv.obj [|
391 "runTreeConstructionTest", Jv.callback ~arity:2 (fun filename content ->
392 run_tree_construction_test (Jv.to_string filename) (Jv.to_string content));
393 "runEncodingTest", Jv.callback ~arity:2 (fun filename content ->
394 run_encoding_test (Jv.to_string filename) (Jv.to_string content));
395 "runAllTests", Jv.callback ~arity:1 (fun files_jv ->
396 let files = Jv.to_list (fun item ->
397 let test_type = Jv.to_string (Jv.get item "type") in
398 let filename = Jv.to_string (Jv.get item "filename") in
399 let content = Jv.to_string (Jv.get item "content") in
400 (test_type, filename, content)
401 ) files_jv in
402 run_all_tests files);
403 "quickParseTest", Jv.callback ~arity:1 (fun html ->
404 quick_parse_test (Jv.to_string html));
405 "version", Jv.of_string "1.0.0";
406 |] in
407 Jv.set Jv.global "html5rwTests" html5rw_tests