OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 15 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: ISC 4 ---------------------------------------------------------------------------*) 5 6(** Browser-compatible test runner for html5rw regression tests. 7 8 This module provides functions to run html5lib conformance tests 9 in the browser, receiving test data as strings and returning results 10 as JavaScript-accessible objects. *) 11 12[@@@warning "-69-33"] (* Silence unused-field and unused-open warnings *) 13 14open Brr 15 16(* ============================================================ *) 17(* Test Result Types *) 18(* ============================================================ *) 19 20type test_result = { 21 test_num : int; 22 description : string; 23 input : string; 24 expected : string; 25 actual : string; 26 success : bool; 27} 28 29type file_result = { 30 filename : string; 31 test_type : string; 32 passed_count : int; 33 failed_count : int; 34 tests : test_result list; 35} 36 37type suite_result = { 38 name : string; 39 total_passed : int; 40 total_failed : int; 41 files : file_result list; 42} 43 44(* ============================================================ *) 45(* Tree Construction Tests *) 46(* ============================================================ *) 47 48module TreeConstruction = struct 49 type test_case = { 50 input : string; 51 expected_tree : string; 52 expected_errors : string list; 53 script_on : bool; 54 fragment_context : string option; 55 } 56 57 let parse_test_case lines = 58 let rec parse acc = function 59 | [] -> acc 60 | line :: rest when String.length line > 0 && line.[0] = '#' -> 61 let section = String.trim line in 62 let content, remaining = collect_section rest in 63 parse ((section, content) :: acc) remaining 64 | _ :: rest -> parse acc rest 65 and collect_section lines = 66 let rec loop acc = function 67 | [] -> (List.rev acc, []) 68 | line :: rest when String.length line > 0 && line.[0] = '#' -> 69 (List.rev acc, line :: rest) 70 | line :: rest -> loop (line :: acc) rest 71 in 72 loop [] lines 73 in 74 let sections = parse [] lines in 75 let get_section name = 76 match List.assoc_opt name sections with 77 | Some lines -> String.concat "\n" lines 78 | None -> "" 79 in 80 let data = get_section "#data" in 81 let document = get_section "#document" in 82 let errors_text = get_section "#errors" in 83 let errors = 84 String.split_on_char '\n' errors_text 85 |> List.filter (fun s -> String.trim s <> "") 86 in 87 let script_on = List.mem_assoc "#script-on" sections in 88 let fragment = 89 if List.mem_assoc "#document-fragment" sections then 90 Some (get_section "#document-fragment" |> String.trim) 91 else None 92 in 93 { input = data; expected_tree = document; expected_errors = errors; 94 script_on; fragment_context = fragment } 95 96 let parse_dat_content content = 97 let lines = String.split_on_char '\n' content in 98 let rec split_tests current acc = function 99 | [] -> 100 if current = [] then List.rev acc 101 else List.rev (List.rev current :: acc) 102 | "" :: "#data" :: rest -> 103 let new_acc = if current = [] then acc else (List.rev current :: acc) in 104 split_tests ["#data"] new_acc rest 105 | line :: rest -> 106 split_tests (line :: current) acc rest 107 in 108 let test_groups = split_tests [] [] lines in 109 List.filter_map (fun lines -> 110 if List.exists (fun l -> l = "#data") lines then 111 Some (parse_test_case lines) 112 else None 113 ) test_groups 114 115 let strip_tree_prefix s = 116 let lines = String.split_on_char '\n' s in 117 let stripped = List.filter_map (fun line -> 118 if String.length line >= 2 && String.sub line 0 2 = "| " then 119 Some (String.sub line 2 (String.length line - 2)) 120 else if String.trim line = "" then None 121 else Some line 122 ) lines in 123 String.concat "\n" stripped 124 125 let normalize_tree s = 126 let lines = String.split_on_char '\n' s in 127 let non_empty = List.filter (fun l -> String.trim l <> "") lines in 128 String.concat "\n" non_empty 129 130 let run_test test = 131 try 132 let result = 133 match test.fragment_context with 134 | Some ctx_str -> 135 let (namespace, tag_name) = 136 match String.split_on_char ' ' ctx_str with 137 | [ns; tag] when ns = "svg" -> (Some "svg", tag) 138 | [ns; tag] when ns = "math" -> (Some "mathml", tag) 139 | [tag] -> (None, tag) 140 | _ -> (None, ctx_str) 141 in 142 let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in 143 let reader = Bytesrw.Bytes.Reader.of_string test.input in 144 Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader 145 | None -> 146 let reader = Bytesrw.Bytes.Reader.of_string test.input in 147 Html5rw.Parser.parse ~collect_errors:true reader 148 in 149 let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in 150 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in 151 let actual = normalize_tree (strip_tree_prefix actual_tree) in 152 (expected = actual, expected, actual) 153 with e -> 154 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in 155 (false, expected, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e)) 156 157 let run_content ~filename content = 158 let tests = parse_dat_content content in 159 let passed = ref 0 in 160 let failed = ref 0 in 161 let results = ref [] in 162 List.iteri (fun i test -> 163 if test.script_on then () 164 else begin 165 let (success, expected, actual) = run_test test in 166 let description = 167 let input_preview = String.sub test.input 0 (min 60 (String.length test.input)) in 168 if test.fragment_context <> None then 169 Printf.sprintf "Fragment (%s): %s" (Option.get test.fragment_context) input_preview 170 else input_preview 171 in 172 let result = { 173 test_num = i + 1; description; input = test.input; expected; actual; success; 174 } in 175 results := result :: !results; 176 if success then incr passed else incr failed 177 end 178 ) tests; 179 { 180 filename; test_type = "Tree Construction"; 181 passed_count = !passed; failed_count = !failed; 182 tests = List.rev !results; 183 } 184end 185 186(* ============================================================ *) 187(* Encoding Tests *) 188(* ============================================================ *) 189 190module EncodingTests = struct 191 type test_case = { 192 input : string; 193 expected_encoding : string; 194 } 195 196 let normalize_encoding_name s = String.lowercase_ascii (String.trim s) 197 198 let encoding_to_test_name = function 199 | Html5rw.Encoding.Utf8 -> "utf-8" 200 | Html5rw.Encoding.Utf16le -> "utf-16le" 201 | Html5rw.Encoding.Utf16be -> "utf-16be" 202 | Html5rw.Encoding.Windows_1252 -> "windows-1252" 203 | Html5rw.Encoding.Iso_8859_2 -> "iso-8859-2" 204 | Html5rw.Encoding.Euc_jp -> "euc-jp" 205 206 let parse_test_case lines = 207 let rec parse acc = function 208 | [] -> acc 209 | line :: rest when String.length line > 0 && line.[0] = '#' -> 210 let section = String.trim line in 211 let content, remaining = collect_section rest in 212 parse ((section, content) :: acc) remaining 213 | _ :: rest -> parse acc rest 214 and collect_section lines = 215 let rec loop acc = function 216 | [] -> (List.rev acc, []) 217 | line :: rest when String.length line > 0 && line.[0] = '#' -> 218 (List.rev acc, line :: rest) 219 | line :: rest -> loop (line :: acc) rest 220 in loop [] lines 221 in 222 let sections = parse [] lines in 223 let get_section name = 224 match List.assoc_opt name sections with 225 | Some lines -> String.concat "\n" lines | None -> "" 226 in 227 let data = get_section "#data" in 228 let encoding = get_section "#encoding" in 229 { input = data; expected_encoding = String.trim encoding } 230 231 let parse_dat_content content = 232 let lines = String.split_on_char '\n' content in 233 let rec split_tests current acc = function 234 | [] -> if current = [] then List.rev acc else List.rev (List.rev current :: acc) 235 | "" :: "#data" :: rest -> 236 let new_acc = if current = [] then acc else (List.rev current :: acc) in 237 split_tests ["#data"] new_acc rest 238 | line :: rest -> split_tests (line :: current) acc rest 239 in 240 let test_groups = split_tests [] [] lines in 241 List.filter_map (fun lines -> 242 if List.exists (fun l -> l = "#data") lines then Some (parse_test_case lines) 243 else None 244 ) test_groups 245 246 let run_test test = 247 try 248 let (_, detected_encoding) = Html5rw.Encoding.decode (Bytes.of_string test.input) () in 249 let detected_name = encoding_to_test_name detected_encoding in 250 let expected_name = normalize_encoding_name test.expected_encoding in 251 let match_encoding det exp = 252 det = exp || 253 (det = "windows-1252" && (exp = "windows-1252" || exp = "cp1252" || exp = "iso-8859-1")) || 254 (det = "iso-8859-2" && (exp = "iso-8859-2" || exp = "iso8859-2" || exp = "latin2")) || 255 (det = "utf-8" && (exp = "utf-8" || exp = "utf8")) || 256 (det = "euc-jp" && (exp = "euc-jp" || exp = "eucjp")) 257 in 258 (match_encoding detected_name expected_name, detected_name, expected_name) 259 with e -> 260 (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected_encoding) 261 262 let run_content ~filename content = 263 let tests = parse_dat_content content in 264 let passed = ref 0 in 265 let failed = ref 0 in 266 let results = ref [] in 267 List.iteri (fun i test -> 268 if String.trim test.expected_encoding = "" then () 269 else begin 270 let (success, detected, expected) = run_test test in 271 let result = { 272 test_num = i + 1; 273 description = Printf.sprintf "Detect %s encoding" expected; 274 input = String.escaped test.input; 275 expected; actual = detected; success; 276 } in 277 results := result :: !results; 278 if success then incr passed else incr failed 279 end 280 ) tests; 281 { 282 filename; test_type = "Encoding Detection"; 283 passed_count = !passed; failed_count = !failed; 284 tests = List.rev !results; 285 } 286end 287 288(* ============================================================ *) 289(* JavaScript API *) 290(* ============================================================ *) 291 292let test_result_to_jv (r : test_result) = 293 Jv.obj [| 294 "testNum", Jv.of_int r.test_num; 295 "description", Jv.of_string r.description; 296 "input", Jv.of_string r.input; 297 "expected", Jv.of_string r.expected; 298 "actual", Jv.of_string r.actual; 299 "success", Jv.of_bool r.success; 300 |] 301 302let file_result_to_jv (r : file_result) = 303 Jv.obj [| 304 "filename", Jv.of_string r.filename; 305 "testType", Jv.of_string r.test_type; 306 "passedCount", Jv.of_int r.passed_count; 307 "failedCount", Jv.of_int r.failed_count; 308 "tests", Jv.of_list test_result_to_jv r.tests; 309 |] 310 311let suite_result_to_jv (r : suite_result) = 312 Jv.obj [| 313 "name", Jv.of_string r.name; 314 "totalPassed", Jv.of_int r.total_passed; 315 "totalFailed", Jv.of_int r.total_failed; 316 "files", Jv.of_list file_result_to_jv r.files; 317 |] 318 319(** Run tree construction tests on a single file's content *) 320let run_tree_construction_test filename content = 321 let result = TreeConstruction.run_content ~filename content in 322 file_result_to_jv result 323 324(** Run encoding detection tests on a single file's content *) 325let run_encoding_test filename content = 326 let result = EncodingTests.run_content ~filename content in 327 file_result_to_jv result 328 329(** Run all tests from provided test data *) 330let run_all_tests (test_files : (string * string * string) list) = 331 let tree_files = ref [] in 332 let encoding_files = ref [] in 333 let total_passed = ref 0 in 334 let total_failed = ref 0 in 335 336 List.iter (fun (test_type, filename, content) -> 337 let result = match test_type with 338 | "tree-construction" -> 339 let r = TreeConstruction.run_content ~filename content in 340 tree_files := r :: !tree_files; 341 r 342 | "encoding" -> 343 let r = EncodingTests.run_content ~filename content in 344 encoding_files := r :: !encoding_files; 345 r 346 | _ -> failwith ("Unknown test type: " ^ test_type) 347 in 348 total_passed := !total_passed + result.passed_count; 349 total_failed := !total_failed + result.failed_count 350 ) test_files; 351 352 let all_files = List.rev !tree_files @ List.rev !encoding_files in 353 let suite = { 354 name = "HTML5lib Regression Tests"; 355 total_passed = !total_passed; 356 total_failed = !total_failed; 357 files = all_files; 358 } in 359 suite_result_to_jv suite 360 361(* ============================================================ *) 362(* Simple Parser Test for Quick Validation *) 363(* ============================================================ *) 364 365let quick_parse_test html = 366 try 367 let reader = Bytesrw.Bytes.Reader.of_string html in 368 let result = Html5rw.Parser.parse ~collect_errors:true reader in 369 let root = Html5rw.Parser.root result in 370 let serialized = Html5rw.Dom.to_html root in 371 let errors = Html5rw.Parser.errors result in 372 let error_to_string e = Format.asprintf "%a" Html5rw.pp_parse_error e in 373 Jv.obj [| 374 "success", Jv.of_bool true; 375 "html", Jv.of_string serialized; 376 "errorCount", Jv.of_int (List.length errors); 377 "errors", Jv.of_list (fun e -> Jv.of_string (error_to_string e)) errors; 378 |] 379 with e -> 380 Jv.obj [| 381 "success", Jv.of_bool false; 382 "error", Jv.of_string (Printexc.to_string e); 383 |] 384 385(* ============================================================ *) 386(* Export to JavaScript *) 387(* ============================================================ *) 388 389let () = 390 let html5rw_tests = Jv.obj [| 391 "runTreeConstructionTest", Jv.callback ~arity:2 (fun filename content -> 392 run_tree_construction_test (Jv.to_string filename) (Jv.to_string content)); 393 "runEncodingTest", Jv.callback ~arity:2 (fun filename content -> 394 run_encoding_test (Jv.to_string filename) (Jv.to_string content)); 395 "runAllTests", Jv.callback ~arity:1 (fun files_jv -> 396 let files = Jv.to_list (fun item -> 397 let test_type = Jv.to_string (Jv.get item "type") in 398 let filename = Jv.to_string (Jv.get item "filename") in 399 let content = Jv.to_string (Jv.get item "content") in 400 (test_type, filename, content) 401 ) files_jv in 402 run_all_tests files); 403 "quickParseTest", Jv.callback ~arity:1 (fun html -> 404 quick_parse_test (Jv.to_string html)); 405 "version", Jv.of_string "1.0.0"; 406 |] in 407 Jv.set Jv.global "html5rwTests" html5rw_tests