OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Test runner for html5lib-tests tree construction tests *) 2 3open Bytesrw 4 5module Parser = Html5rw.Parser 6module Dom = Html5rw.Dom 7module Report = Test_report 8 9type test_case = { 10 input : string; 11 expected_tree : string; 12 expected_errors : string list; 13 script_on : bool; 14 fragment_context : string option; 15 raw_lines : string; (* Original test data from .dat file *) 16} 17 18let _is_blank s = String.trim s = "" 19 20(* Parse a single test case from lines *) 21let parse_test_case lines = 22 let raw_lines = String.concat "\n" lines in 23 let rec parse acc = function 24 | [] -> acc 25 | line :: rest when String.length line > 0 && line.[0] = '#' -> 26 let section = String.trim line in 27 let content, remaining = collect_section rest in 28 parse ((section, content) :: acc) remaining 29 | _ :: rest -> parse acc rest 30 and collect_section lines = 31 let rec loop acc = function 32 | [] -> (List.rev acc, []) 33 | line :: rest when String.length line > 0 && line.[0] = '#' -> 34 (List.rev acc, line :: rest) 35 | line :: rest -> loop (line :: acc) rest 36 in 37 loop [] lines 38 in 39 let sections = parse [] lines in 40 41 let get_section name = 42 match List.assoc_opt name sections with 43 | Some lines -> String.concat "\n" lines 44 | None -> "" 45 in 46 47 let data = get_section "#data" in 48 let document = get_section "#document" in 49 let errors_text = get_section "#errors" in 50 let errors = 51 String.split_on_char '\n' errors_text 52 |> List.filter (fun s -> String.trim s <> "") 53 in 54 let script_on = List.mem_assoc "#script-on" sections in 55 let fragment = 56 if List.mem_assoc "#document-fragment" sections then 57 Some (get_section "#document-fragment" |> String.trim) 58 else None 59 in 60 61 { 62 input = data; 63 expected_tree = document; 64 expected_errors = errors; 65 script_on; 66 fragment_context = fragment; 67 raw_lines; 68 } 69 70(* Parse a .dat file into test cases *) 71let parse_dat_file content = 72 let lines = String.split_on_char '\n' content in 73 (* Split on empty lines followed by #data *) 74 let rec split_tests current acc = function 75 | [] -> 76 if current = [] then List.rev acc 77 else List.rev (List.rev current :: acc) 78 | "" :: "#data" :: rest -> 79 (* End of current test, start new one *) 80 let new_acc = if current = [] then acc else (List.rev current :: acc) in 81 split_tests ["#data"] new_acc rest 82 | line :: rest -> 83 split_tests (line :: current) acc rest 84 in 85 let test_groups = split_tests [] [] lines in 86 List.filter_map (fun lines -> 87 if List.exists (fun l -> l = "#data") lines then 88 Some (parse_test_case lines) 89 else None 90 ) test_groups 91 92(* Strip "| " prefix from each line *) 93let strip_tree_prefix s = 94 let lines = String.split_on_char '\n' s in 95 let stripped = List.filter_map (fun line -> 96 if String.length line >= 2 && String.sub line 0 2 = "| " then 97 Some (String.sub line 2 (String.length line - 2)) 98 else if String.trim line = "" then None 99 else Some line 100 ) lines in 101 String.concat "\n" stripped 102 103(* Normalize tree output for comparison *) 104let normalize_tree s = 105 let lines = String.split_on_char '\n' s in 106 let non_empty = List.filter (fun l -> String.trim l <> "") lines in 107 String.concat "\n" non_empty 108 109let run_test test = 110 try 111 let result = 112 match test.fragment_context with 113 | Some ctx_str -> 114 (* Parse "namespace element" or just "element" *) 115 let (namespace, tag_name) = 116 match String.split_on_char ' ' ctx_str with 117 | [ns; tag] when ns = "svg" -> (Some "svg", tag) 118 | [ns; tag] when ns = "math" -> (Some "mathml", tag) 119 | [tag] -> (None, tag) 120 | _ -> (None, ctx_str) 121 in 122 let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in 123 let reader = Bytes.Reader.of_string test.input in 124 Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader 125 | None -> 126 let reader = Bytes.Reader.of_string test.input in 127 Html5rw.Parser.parse ~collect_errors:true reader 128 in 129 let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in 130 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in 131 let actual = normalize_tree (strip_tree_prefix actual_tree) in 132 let error_count = List.length (Html5rw.Parser.errors result) in 133 let expected_error_count = List.length test.expected_errors in 134 (expected = actual, expected, actual, error_count, expected_error_count) 135 with e -> 136 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in 137 (false, expected, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), 0, 0) 138 139let run_file path = 140 let ic = open_in path in 141 let content = really_input_string ic (in_channel_length ic) in 142 close_in ic; 143 144 let tests = parse_dat_file content in 145 let filename = Filename.basename path in 146 147 let passed = ref 0 in 148 let failed = ref 0 in 149 let results = ref [] in 150 151 List.iteri (fun i test -> 152 (* Skip script-on tests since we don't support scripting *) 153 if test.script_on then 154 () (* Skip this test *) 155 else begin 156 let (success, expected, actual, actual_error_count, expected_error_count) = run_test test in 157 let description = 158 let input_preview = String.sub test.input 0 (min 60 (String.length test.input)) in 159 if test.fragment_context <> None then 160 Printf.sprintf "Fragment (%s): %s" (Option.get test.fragment_context) input_preview 161 else 162 input_preview 163 in 164 let details = [ 165 ("Fragment Context", Option.value test.fragment_context ~default:"(none)"); 166 ("Expected Errors", string_of_int expected_error_count); 167 ("Actual Errors", string_of_int actual_error_count); 168 ] in 169 let result : Report.test_result = { 170 test_num = i + 1; 171 description; 172 input = test.input; 173 expected; 174 actual; 175 success; 176 details; 177 raw_test_data = Some test.raw_lines; 178 } in 179 results := result :: !results; 180 if success then incr passed else incr failed 181 end 182 ) tests; 183 184 let file_result : Report.file_result = { 185 filename; 186 test_type = "Tree Construction"; 187 passed_count = !passed; 188 failed_count = !failed; 189 tests = List.rev !results; 190 } in 191 (file_result, !passed, !failed) 192 193let () = 194 let test_dir = Sys.argv.(1) in 195 let files = Sys.readdir test_dir |> Array.to_list in 196 let dat_files = List.filter (fun f -> 197 Filename.check_suffix f ".dat" && 198 not (String.contains f '/') (* Skip subdirectories *) 199 ) files in 200 201 let total_passed = ref 0 in 202 let total_failed = ref 0 in 203 let file_results = ref [] in 204 205 List.iter (fun file -> 206 let path = Filename.concat test_dir file in 207 if Sys.is_directory path then () else begin 208 let (file_result, passed, failed) = run_file path in 209 total_passed := !total_passed + passed; 210 total_failed := !total_failed + failed; 211 file_results := file_result :: !file_results; 212 Printf.printf "%s: %d passed, %d failed\n" file passed failed 213 end 214 ) (List.sort String.compare dat_files); 215 216 Printf.printf "\n=== Summary ===\n"; 217 Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed; 218 219 (* Generate HTML report *) 220 let report : Report.report = { 221 title = "HTML5 Tree Construction Tests"; 222 test_type = "tree-construction"; 223 description = "These tests validate the HTML5 tree construction algorithm as specified in the WHATWG HTML Standard. \ 224 Each test provides HTML input and the expected DOM tree structure. The parser processes the HTML and \ 225 builds a document tree, which is then serialized and compared against the expected output. \ 226 Tests cover various edge cases including malformed HTML, implicit element creation, foster parenting, \ 227 adoption agency algorithm, and foreign content (SVG/MathML). Fragment parsing tests verify parsing \ 228 in the context of specific elements."; 229 files = List.rev !file_results; 230 total_passed = !total_passed; 231 total_failed = !total_failed; 232 } in 233 Report.generate_report report "test_html5lib_report.html"; 234 235 exit (if !total_failed > 0 then 1 else 0)