OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Combined test runner for all html5lib-tests *) 2(* Generates a single standalone HTML report *) 3 4open Bytesrw 5 6module Report = Test_report 7 8(* ============================================================ *) 9(* Tree Construction Tests *) 10(* ============================================================ *) 11 12module TreeConstruction = struct 13 module Parser = Html5rw.Parser 14 module Dom = Html5rw.Dom 15 16 type test_case = { 17 input : string; 18 expected_tree : string; 19 expected_errors : string list; 20 script_on : bool; 21 fragment_context : string option; 22 raw_lines : string; 23 } 24 25 let parse_test_case lines = 26 let raw_lines = String.concat "\n" lines in 27 let rec parse acc = function 28 | [] -> acc 29 | line :: rest when String.length line > 0 && line.[0] = '#' -> 30 let section = String.trim line in 31 let content, remaining = collect_section rest in 32 parse ((section, content) :: acc) remaining 33 | _ :: rest -> parse acc rest 34 and collect_section lines = 35 let rec loop acc = function 36 | [] -> (List.rev acc, []) 37 | line :: rest when String.length line > 0 && line.[0] = '#' -> 38 (List.rev acc, line :: rest) 39 | line :: rest -> loop (line :: acc) rest 40 in 41 loop [] lines 42 in 43 let sections = parse [] lines in 44 let get_section name = 45 match List.assoc_opt name sections with 46 | Some lines -> String.concat "\n" lines 47 | None -> "" 48 in 49 let data = get_section "#data" in 50 let document = get_section "#document" in 51 let errors_text = get_section "#errors" in 52 let errors = 53 String.split_on_char '\n' errors_text 54 |> List.filter (fun s -> String.trim s <> "") 55 in 56 let script_on = List.mem_assoc "#script-on" sections in 57 let fragment = 58 if List.mem_assoc "#document-fragment" sections then 59 Some (get_section "#document-fragment" |> String.trim) 60 else None 61 in 62 { input = data; expected_tree = document; expected_errors = errors; 63 script_on; fragment_context = fragment; raw_lines } 64 65 let parse_dat_file content = 66 let lines = String.split_on_char '\n' content in 67 let rec split_tests current acc = function 68 | [] -> 69 if current = [] then List.rev acc 70 else List.rev (List.rev current :: acc) 71 | "" :: "#data" :: rest -> 72 let new_acc = if current = [] then acc else (List.rev current :: acc) in 73 split_tests ["#data"] new_acc rest 74 | line :: rest -> 75 split_tests (line :: current) acc rest 76 in 77 let test_groups = split_tests [] [] lines in 78 List.filter_map (fun lines -> 79 if List.exists (fun l -> l = "#data") lines then 80 Some (parse_test_case lines) 81 else None 82 ) test_groups 83 84 let strip_tree_prefix s = 85 let lines = String.split_on_char '\n' s in 86 let stripped = List.filter_map (fun line -> 87 if String.length line >= 2 && String.sub line 0 2 = "| " then 88 Some (String.sub line 2 (String.length line - 2)) 89 else if String.trim line = "" then None 90 else Some line 91 ) lines in 92 String.concat "\n" stripped 93 94 let normalize_tree s = 95 let lines = String.split_on_char '\n' s in 96 let non_empty = List.filter (fun l -> String.trim l <> "") lines in 97 String.concat "\n" non_empty 98 99 let run_test test = 100 try 101 let result = 102 match test.fragment_context with 103 | Some ctx_str -> 104 let (namespace, tag_name) = 105 match String.split_on_char ' ' ctx_str with 106 | [ns; tag] when ns = "svg" -> (Some "svg", tag) 107 | [ns; tag] when ns = "math" -> (Some "mathml", tag) 108 | [tag] -> (None, tag) 109 | _ -> (None, ctx_str) 110 in 111 let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in 112 let reader = Bytes.Reader.of_string test.input in 113 Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader 114 | None -> 115 let reader = Bytes.Reader.of_string test.input in 116 Html5rw.Parser.parse ~collect_errors:true reader 117 in 118 let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in 119 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in 120 let actual = normalize_tree (strip_tree_prefix actual_tree) in 121 let error_count = List.length (Html5rw.Parser.errors result) in 122 let expected_error_count = List.length test.expected_errors in 123 (expected = actual, expected, actual, error_count, expected_error_count) 124 with e -> 125 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in 126 (false, expected, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), 0, 0) 127 128 let run_file path = 129 let ic = open_in path in 130 let content = really_input_string ic (in_channel_length ic) in 131 close_in ic; 132 let tests = parse_dat_file content in 133 let filename = Filename.basename path in 134 let passed = ref 0 in 135 let failed = ref 0 in 136 let results = ref [] in 137 List.iteri (fun i test -> 138 if test.script_on then () 139 else begin 140 let (success, expected, actual, actual_error_count, expected_error_count) = run_test test in 141 let description = 142 let input_preview = String.sub test.input 0 (min 60 (String.length test.input)) in 143 if test.fragment_context <> None then 144 Printf.sprintf "Fragment (%s): %s" (Option.get test.fragment_context) input_preview 145 else input_preview 146 in 147 let result : Report.test_result = { 148 test_num = i + 1; description; input = test.input; expected; actual; success; 149 details = [ 150 ("Fragment Context", Option.value test.fragment_context ~default:"(none)"); 151 ("Expected Errors", string_of_int expected_error_count); 152 ("Actual Errors", string_of_int actual_error_count); 153 ]; 154 raw_test_data = Some test.raw_lines; 155 } in 156 results := result :: !results; 157 if success then incr passed else incr failed 158 end 159 ) tests; 160 let file_result : Report.file_result = { 161 filename; test_type = "Tree Construction"; 162 passed_count = !passed; failed_count = !failed; 163 tests = List.rev !results; 164 } in 165 (file_result, !passed, !failed) 166 167 let run_dir test_dir = 168 let files = Sys.readdir test_dir |> Array.to_list in 169 let dat_files = List.filter (fun f -> 170 Filename.check_suffix f ".dat" && not (String.contains f '/') 171 ) files in 172 let total_passed = ref 0 in 173 let total_failed = ref 0 in 174 let file_results = ref [] in 175 List.iter (fun file -> 176 let path = Filename.concat test_dir file in 177 if Sys.is_directory path then () else begin 178 let (file_result, passed, failed) = run_file path in 179 total_passed := !total_passed + passed; 180 total_failed := !total_failed + failed; 181 file_results := file_result :: !file_results; 182 Printf.printf " %s: %d passed, %d failed\n" file passed failed 183 end 184 ) (List.sort String.compare dat_files); 185 (List.rev !file_results, !total_passed, !total_failed) 186end 187 188(* ============================================================ *) 189(* Tokenizer Tests *) 190(* ============================================================ *) 191 192module Tokenizer_tests = struct 193 module Tokenizer = Html5rw.Tokenizer 194 195 module TokenCollector = struct 196 type t = { mutable tokens : Html5rw.Tokenizer.Token.t list } 197 let create () = { tokens = [] } 198 let process t token ~line:_ ~column:_ = t.tokens <- token :: t.tokens; `Continue 199 let adjusted_current_node_in_html_namespace _ = true 200 let get_tokens t = List.rev t.tokens 201 end 202 203 type test_case = { 204 description : string; 205 input : string; 206 output : Jsont.json list; 207 expected_error_count : int; 208 initial_states : string list; 209 last_start_tag : string option; 210 double_escaped : bool; 211 xml_mode : bool; 212 raw_json : string; 213 } 214 215 let unescape_double s = 216 let b = Buffer.create (String.length s) in 217 let i = ref 0 in 218 while !i < String.length s do 219 if !i + 1 < String.length s && s.[!i] = '\\' then begin 220 match s.[!i + 1] with 221 | 'u' when !i + 5 < String.length s -> 222 let hex = String.sub s (!i + 2) 4 in 223 (try 224 let code = int_of_string ("0x" ^ hex) in 225 if code < 128 then Buffer.add_char b (Char.chr code) 226 else begin 227 if code < 0x800 then begin 228 Buffer.add_char b (Char.chr (0xC0 lor (code lsr 6))); 229 Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F))) 230 end else begin 231 Buffer.add_char b (Char.chr (0xE0 lor (code lsr 12))); 232 Buffer.add_char b (Char.chr (0x80 lor ((code lsr 6) land 0x3F))); 233 Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F))) 234 end 235 end; 236 i := !i + 6 237 with _ -> Buffer.add_char b s.[!i]; incr i) 238 | 'n' -> Buffer.add_char b '\n'; i := !i + 2 239 | 'r' -> Buffer.add_char b '\r'; i := !i + 2 240 | 't' -> Buffer.add_char b '\t'; i := !i + 2 241 | '\\' -> Buffer.add_char b '\\'; i := !i + 2 242 | _ -> Buffer.add_char b s.[!i]; incr i 243 end else begin 244 Buffer.add_char b s.[!i]; incr i 245 end 246 done; 247 Buffer.contents b 248 249 let json_string = function Jsont.String (s, _) -> s | _ -> failwith "Expected string" 250 let json_bool = function Jsont.Bool (b, _) -> b | _ -> failwith "Expected bool" 251 let json_array = function Jsont.Array (arr, _) -> arr | _ -> failwith "Expected array" 252 let json_object = function Jsont.Object (obj, _) -> obj | _ -> failwith "Expected object" 253 254 let json_mem name obj = 255 match List.find_opt (fun ((n, _), _) -> n = name) obj with 256 | Some (_, v) -> Some v | None -> None 257 258 let json_mem_exn name obj = 259 match json_mem name obj with Some v -> v | None -> failwith ("Missing member: " ^ name) 260 261 let rec json_to_string = function 262 | Jsont.Null _ -> "null" 263 | Jsont.Bool (b, _) -> string_of_bool b 264 | Jsont.Number (n, _) -> Printf.sprintf "%g" n 265 | Jsont.String (s, _) -> Printf.sprintf "%S" s 266 | Jsont.Array (arr, _) -> "[" ^ String.concat ", " (List.map json_to_string arr) ^ "]" 267 | Jsont.Object (obj, _) -> 268 "{" ^ String.concat ", " (List.map (fun ((n, _), v) -> Printf.sprintf "%S: %s" n (json_to_string v)) obj) ^ "}" 269 270 let parse_test_case ~xml_mode json = 271 let raw_json = json_to_string json in 272 let obj = json_object json in 273 let description = json_string (json_mem_exn "description" obj) in 274 let input = json_string (json_mem_exn "input" obj) in 275 let output = json_array (json_mem_exn "output" obj) in 276 let expected_error_count = match json_mem "errors" obj with 277 | Some e -> List.length (json_array e) | None -> 0 278 in 279 let initial_states = match json_mem "initialStates" obj with 280 | Some s -> List.map json_string (json_array s) | None -> ["Data state"] 281 in 282 let last_start_tag = match json_mem "lastStartTag" obj with 283 | Some s -> Some (json_string s) | None -> None 284 in 285 let double_escaped = match json_mem "doubleEscaped" obj with 286 | Some b -> json_bool b | None -> false 287 in 288 { description; input; output; expected_error_count; initial_states; 289 last_start_tag; double_escaped; xml_mode; raw_json } 290 291 let state_of_string = function 292 | "Data state" -> Html5rw.Tokenizer.State.Data 293 | "PLAINTEXT state" -> Html5rw.Tokenizer.State.Plaintext 294 | "RCDATA state" -> Html5rw.Tokenizer.State.Rcdata 295 | "RAWTEXT state" -> Html5rw.Tokenizer.State.Rawtext 296 | "Script data state" -> Html5rw.Tokenizer.State.Script_data 297 | "CDATA section state" -> Html5rw.Tokenizer.State.Cdata_section 298 | s -> failwith ("Unknown state: " ^ s) 299 300 let token_to_test_json (tok : Html5rw.Tokenizer.Token.t) : Jsont.json list = 301 let str s = Jsont.String (s, Jsont.Meta.none) in 302 let arr l = Jsont.Array (l, Jsont.Meta.none) in 303 match tok with 304 | Html5rw.Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } -> 305 let name_json = match name with Some n -> str n | None -> Jsont.Null ((), Jsont.Meta.none) in 306 let public_json = match public_id with Some p -> str p | None -> Jsont.Null ((), Jsont.Meta.none) in 307 let system_json = match system_id with Some s -> str s | None -> Jsont.Null ((), Jsont.Meta.none) in 308 let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in 309 [arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]] 310 | Html5rw.Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } -> 311 let attrs_obj = Jsont.Object ( 312 List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs), 313 Jsont.Meta.none 314 ) in 315 if self_closing then [arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]] 316 else [arr [str "StartTag"; str name; attrs_obj]] 317 | Html5rw.Tokenizer.Token.Tag { kind = End; name; _ } -> [arr [str "EndTag"; str name]] 318 | Html5rw.Tokenizer.Token.Comment data -> [arr [str "Comment"; str data]] 319 | Html5rw.Tokenizer.Token.Character data -> [arr [str "Character"; str data]] 320 | Html5rw.Tokenizer.Token.EOF -> [] 321 322 let rec json_equal a b = 323 match a, b with 324 | Jsont.Null _, Jsont.Null _ -> true 325 | Jsont.Bool (a, _), Jsont.Bool (b, _) -> a = b 326 | Jsont.Number (a, _), Jsont.Number (b, _) -> a = b 327 | Jsont.String (a, _), Jsont.String (b, _) -> a = b 328 | Jsont.Array (a, _), Jsont.Array (b, _) -> 329 List.length a = List.length b && List.for_all2 json_equal a b 330 | Jsont.Object (a, _), Jsont.Object (b, _) -> 331 let a_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) a in 332 let b_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) b in 333 List.length a_sorted = List.length b_sorted && 334 List.for_all2 (fun ((n1, _), v1) ((n2, _), v2) -> n1 = n2 && json_equal v1 v2) a_sorted b_sorted 335 | _ -> false 336 337 let merge_character_tokens tokens = 338 let rec loop acc = function 339 | [] -> List.rev acc 340 | Html5rw.Tokenizer.Token.Character s1 :: Html5rw.Tokenizer.Token.Character s2 :: rest -> 341 loop acc (Html5rw.Tokenizer.Token.Character (s1 ^ s2) :: rest) 342 | tok :: rest -> loop (tok :: acc) rest 343 in loop [] tokens 344 345 let run_test test initial_state = 346 let input = if test.double_escaped then unescape_double test.input else test.input in 347 let collector = TokenCollector.create () in 348 let tokenizer = Html5rw.Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in 349 Html5rw.Tokenizer.set_state tokenizer initial_state; 350 (match test.last_start_tag with Some tag -> Html5rw.Tokenizer.set_last_start_tag tokenizer tag | None -> ()); 351 let reader = Bytes.Reader.of_string input in 352 Html5rw.Tokenizer.run tokenizer (module TokenCollector) reader; 353 let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in 354 let actual_tokens = List.concat_map token_to_test_json tokens in 355 let expected_output = if test.double_escaped then 356 let rec unescape_json = function 357 | Jsont.String (s, m) -> Jsont.String (unescape_double s, m) 358 | Jsont.Array (arr, m) -> Jsont.Array (List.map unescape_json arr, m) 359 | Jsont.Object (obj, m) -> Jsont.Object (List.map (fun (n, v) -> (n, unescape_json v)) obj, m) 360 | other -> other 361 in List.map unescape_json test.output 362 else test.output in 363 let rec merge_expected = function 364 | [] -> [] 365 | [x] -> [x] 366 | Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s1, m1)], am1) :: 367 Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s2, _)], _) :: rest -> 368 merge_expected (Jsont.Array ([Jsont.String ("Character", Jsont.Meta.none); Jsont.String (s1 ^ s2, m1)], am1) :: rest) 369 | x :: rest -> x :: merge_expected rest 370 in 371 let expected = merge_expected expected_output in 372 let tokens_match = 373 List.length actual_tokens = List.length expected && 374 List.for_all2 json_equal actual_tokens expected 375 in 376 let actual_error_count = List.length (Html5rw.Tokenizer.get_errors tokenizer) in 377 let errors_count_match = actual_error_count = test.expected_error_count in 378 (tokens_match && errors_count_match, actual_tokens, expected, actual_error_count, test.expected_error_count) 379 380 let run_file path = 381 let content = 382 let ic = open_in path in 383 let n = in_channel_length ic in 384 let s = really_input_string ic n in 385 close_in ic; s 386 in 387 let json = match Jsont_bytesrw.decode_string Jsont.json content with 388 | Ok j -> j 389 | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e) 390 in 391 let obj = json_object json in 392 let regular_tests = match json_mem "tests" obj with 393 | Some t -> List.map (parse_test_case ~xml_mode:false) (json_array t) | None -> [] 394 in 395 let xml_tests = match json_mem "xmlViolationTests" obj with 396 | Some t -> List.map (parse_test_case ~xml_mode:true) (json_array t) | None -> [] 397 in 398 let all_tests = regular_tests @ xml_tests in 399 let filename = Filename.basename path in 400 let passed = ref 0 in 401 let failed = ref 0 in 402 let results = ref [] in 403 List.iteri (fun i test -> 404 List.iter (fun state_name -> 405 try 406 let state = state_of_string state_name in 407 let (success, actual, expected, actual_err_count, expected_err_count) = run_test test state in 408 let description = Printf.sprintf "[%s] %s" state_name test.description in 409 let result : Report.test_result = { 410 test_num = i + 1; description; input = test.input; 411 expected = String.concat "\n" (List.map json_to_string expected); 412 actual = String.concat "\n" (List.map json_to_string actual); 413 success; 414 details = [ 415 ("Initial State", state_name); 416 ("Last Start Tag", Option.value test.last_start_tag ~default:"(none)"); 417 ("Double Escaped", string_of_bool test.double_escaped); 418 ("XML Mode", string_of_bool test.xml_mode); 419 ("Expected Errors", string_of_int expected_err_count); 420 ("Actual Errors", string_of_int actual_err_count); 421 ]; 422 raw_test_data = Some test.raw_json; 423 } in 424 results := result :: !results; 425 if success then incr passed else incr failed 426 with e -> 427 incr failed; 428 let result : Report.test_result = { 429 test_num = i + 1; 430 description = Printf.sprintf "[%s] %s" state_name test.description; 431 input = test.input; expected = ""; 432 actual = Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e); 433 success = false; details = []; 434 raw_test_data = Some test.raw_json; 435 } in 436 results := result :: !results 437 ) test.initial_states 438 ) all_tests; 439 let file_result : Report.file_result = { 440 filename; test_type = "Tokenizer"; 441 passed_count = !passed; failed_count = !failed; 442 tests = List.rev !results; 443 } in 444 (file_result, !passed, !failed) 445 446 let run_dir test_dir = 447 let files = Sys.readdir test_dir |> Array.to_list in 448 let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in 449 let total_passed = ref 0 in 450 let total_failed = ref 0 in 451 let file_results = ref [] in 452 List.iter (fun file -> 453 let path = Filename.concat test_dir file in 454 let (file_result, passed, failed) = run_file path in 455 total_passed := !total_passed + passed; 456 total_failed := !total_failed + failed; 457 file_results := file_result :: !file_results; 458 Printf.printf " %s: %d passed, %d failed\n" file passed failed 459 ) (List.sort String.compare test_files); 460 (List.rev !file_results, !total_passed, !total_failed) 461end 462 463(* ============================================================ *) 464(* Encoding Tests *) 465(* ============================================================ *) 466 467module Encoding_tests = struct 468 module Encoding = Html5rw.Encoding 469 470 type test_case = { 471 input : string; 472 expected_encoding : string; 473 raw_lines : string; 474 } 475 476 let normalize_encoding_name s = String.lowercase_ascii (String.trim s) 477 478 let encoding_to_test_name = function 479 | Html5rw.Encoding.Utf8 -> "utf-8" 480 | Html5rw.Encoding.Utf16le -> "utf-16le" 481 | Html5rw.Encoding.Utf16be -> "utf-16be" 482 | Html5rw.Encoding.Windows_1252 -> "windows-1252" 483 | Html5rw.Encoding.Iso_8859_2 -> "iso-8859-2" 484 | Html5rw.Encoding.Euc_jp -> "euc-jp" 485 486 let parse_test_case lines = 487 let raw_lines = String.concat "\n" lines in 488 let rec parse acc = function 489 | [] -> acc 490 | line :: rest when String.length line > 0 && line.[0] = '#' -> 491 let section = String.trim line in 492 let content, remaining = collect_section rest in 493 parse ((section, content) :: acc) remaining 494 | _ :: rest -> parse acc rest 495 and collect_section lines = 496 let rec loop acc = function 497 | [] -> (List.rev acc, []) 498 | line :: rest when String.length line > 0 && line.[0] = '#' -> 499 (List.rev acc, line :: rest) 500 | line :: rest -> loop (line :: acc) rest 501 in loop [] lines 502 in 503 let sections = parse [] lines in 504 let get_section name = 505 match List.assoc_opt name sections with 506 | Some lines -> String.concat "\n" lines | None -> "" 507 in 508 let data = get_section "#data" in 509 let encoding = get_section "#encoding" in 510 { input = data; expected_encoding = String.trim encoding; raw_lines } 511 512 let parse_dat_file content = 513 let lines = String.split_on_char '\n' content in 514 let rec split_tests current acc = function 515 | [] -> if current = [] then List.rev acc else List.rev (List.rev current :: acc) 516 | "" :: "#data" :: rest -> 517 let new_acc = if current = [] then acc else (List.rev current :: acc) in 518 split_tests ["#data"] new_acc rest 519 | line :: rest -> split_tests (line :: current) acc rest 520 in 521 let test_groups = split_tests [] [] lines in 522 List.filter_map (fun lines -> 523 if List.exists (fun l -> l = "#data") lines then Some (parse_test_case lines) 524 else None 525 ) test_groups 526 527 let run_test test = 528 try 529 let (_, detected_encoding) = Html5rw.Encoding.decode (Bytes.of_string test.input) () in 530 let detected_name = encoding_to_test_name detected_encoding in 531 let expected_name = normalize_encoding_name test.expected_encoding in 532 let match_encoding det exp = 533 det = exp || 534 (det = "windows-1252" && (exp = "windows-1252" || exp = "cp1252" || exp = "iso-8859-1")) || 535 (det = "iso-8859-2" && (exp = "iso-8859-2" || exp = "iso8859-2" || exp = "latin2")) || 536 (det = "utf-8" && (exp = "utf-8" || exp = "utf8")) || 537 (det = "euc-jp" && (exp = "euc-jp" || exp = "eucjp")) 538 in 539 (match_encoding detected_name expected_name, detected_name, expected_name) 540 with e -> 541 (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected_encoding) 542 543 let run_file path = 544 let ic = open_in path in 545 let content = really_input_string ic (in_channel_length ic) in 546 close_in ic; 547 let tests = parse_dat_file content in 548 let filename = Filename.basename path in 549 let passed = ref 0 in 550 let failed = ref 0 in 551 let results = ref [] in 552 List.iteri (fun i test -> 553 if String.trim test.expected_encoding = "" then () 554 else begin 555 let (success, detected, expected) = run_test test in 556 let result : Report.test_result = { 557 test_num = i + 1; 558 description = Printf.sprintf "Detect %s encoding" expected; 559 input = String.escaped test.input; 560 expected; actual = detected; success; 561 details = [ 562 ("Input Length", string_of_int (String.length test.input)); 563 ("Has BOM", string_of_bool (String.length test.input >= 3 && 564 (String.sub test.input 0 3 = "\xEF\xBB\xBF" || 565 String.sub test.input 0 2 = "\xFF\xFE" || 566 String.sub test.input 0 2 = "\xFE\xFF"))); 567 ]; 568 raw_test_data = Some test.raw_lines; 569 } in 570 results := result :: !results; 571 if success then incr passed else incr failed 572 end 573 ) tests; 574 let file_result : Report.file_result = { 575 filename; test_type = "Encoding Detection"; 576 passed_count = !passed; failed_count = !failed; 577 tests = List.rev !results; 578 } in 579 (file_result, !passed, !failed) 580 581 let run_dir test_dir = 582 let files = Sys.readdir test_dir |> Array.to_list in 583 let dat_files = List.filter (fun f -> 584 Filename.check_suffix f ".dat" && not (String.contains f '/') 585 ) files in 586 let total_passed = ref 0 in 587 let total_failed = ref 0 in 588 let file_results = ref [] in 589 List.iter (fun file -> 590 let path = Filename.concat test_dir file in 591 if Sys.is_directory path then () else begin 592 let (file_result, passed, failed) = run_file path in 593 total_passed := !total_passed + passed; 594 total_failed := !total_failed + failed; 595 file_results := file_result :: !file_results; 596 Printf.printf " %s: %d passed, %d failed\n" file passed failed 597 end 598 ) (List.sort String.compare dat_files); 599 (List.rev !file_results, !total_passed, !total_failed) 600end 601 602(* ============================================================ *) 603(* Main Entry Point *) 604(* ============================================================ *) 605 606let () = 607 let base_dir = if Array.length Sys.argv > 1 then Sys.argv.(1) else "html5lib-tests" in 608 609 let all_files = ref [] in 610 let total_passed = ref 0 in 611 let total_failed = ref 0 in 612 613 (* Run Tree Construction Tests *) 614 Printf.printf "\n=== Tree Construction Tests ===\n"; 615 let tree_dir = Filename.concat base_dir "tree-construction" in 616 if Sys.file_exists tree_dir then begin 617 let (files, passed, failed) = TreeConstruction.run_dir tree_dir in 618 all_files := !all_files @ files; 619 total_passed := !total_passed + passed; 620 total_failed := !total_failed + failed; 621 Printf.printf " Subtotal: %d passed, %d failed\n" passed failed 622 end else 623 Printf.printf " (directory not found: %s)\n" tree_dir; 624 625 (* Run Tokenizer Tests *) 626 Printf.printf "\n=== Tokenizer Tests ===\n"; 627 let tok_dir = Filename.concat base_dir "tokenizer" in 628 if Sys.file_exists tok_dir then begin 629 let (files, passed, failed) = Tokenizer_tests.run_dir tok_dir in 630 all_files := !all_files @ files; 631 total_passed := !total_passed + passed; 632 total_failed := !total_failed + failed; 633 Printf.printf " Subtotal: %d passed, %d failed\n" passed failed 634 end else 635 Printf.printf " (directory not found: %s)\n" tok_dir; 636 637 (* Run Encoding Tests *) 638 Printf.printf "\n=== Encoding Detection Tests ===\n"; 639 let enc_dir = Filename.concat base_dir "encoding" in 640 if Sys.file_exists enc_dir then begin 641 let (files, passed, failed) = Encoding_tests.run_dir enc_dir in 642 all_files := !all_files @ files; 643 total_passed := !total_passed + passed; 644 total_failed := !total_failed + failed; 645 Printf.printf " Subtotal: %d passed, %d failed\n" passed failed 646 end else 647 Printf.printf " (directory not found: %s)\n" enc_dir; 648 649 (* Note: Serializer tests use the standalone test_serializer.exe for full implementation *) 650 651 Printf.printf "\n=== Overall Summary ===\n"; 652 Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed; 653 654 (* Generate combined HTML report *) 655 let report : Report.report = { 656 title = "HTML5 Parser Test Suite"; 657 test_type = "combined"; 658 description = "This is a comprehensive test report for the html5rw OCaml HTML5 parser library, \ 659 validating conformance against the official html5lib-tests test suite. \ 660 Tests cover: (1) Tree Construction - validating the DOM tree building algorithm; \ 661 (2) Tokenization - validating the HTML tokenizer state machine; \ 662 (3) Encoding Detection - validating character encoding sniffing. \ 663 Each test shows the input, expected output, actual output, and original test data."; 664 files = !all_files; 665 total_passed = !total_passed; 666 total_failed = !total_failed; 667 match_quality = None; 668 test_type_breakdown = None; 669 strictness_mode = None; 670 run_timestamp = None; 671 } in 672 Report.generate_report report "html5lib_test_report.html"; 673 674 exit (if !total_failed > 0 then 1 else 0)