OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 15 kB view raw
1(* Test runner for html5lib-tests tokenizer tests *) 2 3open Bytesrw 4 5module Tokenizer = Html5rw.Tokenizer 6module Report = Test_report 7 8(* Token collector sink - collects all tokens into a list *) 9module TokenCollector = struct 10 type t = { 11 mutable tokens : Html5rw.Tokenizer.Token.t list; 12 } 13 14 let create () = { tokens = [] } 15 16 let process t token ~line:_ ~column:_ = 17 t.tokens <- token :: t.tokens; 18 `Continue 19 20 let adjusted_current_node_in_html_namespace _ = true 21 22 let get_tokens t = List.rev t.tokens 23end 24 25(* Test case representation *) 26type test_case = { 27 description : string; 28 input : string; 29 output : Jsont.json list; 30 expected_error_count : int; 31 initial_states : string list; 32 last_start_tag : string option; 33 double_escaped : bool; 34 xml_mode : bool; 35 raw_json : string; (* Original JSON representation of this test *) 36} 37 38(* Unescape double-escaped strings from tests *) 39let unescape_double s = 40 let b = Buffer.create (String.length s) in 41 let i = ref 0 in 42 while !i < String.length s do 43 if !i + 1 < String.length s && s.[!i] = '\\' then begin 44 match s.[!i + 1] with 45 | 'u' when !i + 5 < String.length s -> 46 let hex = String.sub s (!i + 2) 4 in 47 (try 48 let code = int_of_string ("0x" ^ hex) in 49 if code < 128 then Buffer.add_char b (Char.chr code) 50 else begin 51 (* UTF-8 encode *) 52 if code < 0x800 then begin 53 Buffer.add_char b (Char.chr (0xC0 lor (code lsr 6))); 54 Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F))) 55 end else begin 56 Buffer.add_char b (Char.chr (0xE0 lor (code lsr 12))); 57 Buffer.add_char b (Char.chr (0x80 lor ((code lsr 6) land 0x3F))); 58 Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F))) 59 end 60 end; 61 i := !i + 6 62 with _ -> 63 Buffer.add_char b s.[!i]; 64 incr i) 65 | 'n' -> Buffer.add_char b '\n'; i := !i + 2 66 | 'r' -> Buffer.add_char b '\r'; i := !i + 2 67 | 't' -> Buffer.add_char b '\t'; i := !i + 2 68 | '\\' -> Buffer.add_char b '\\'; i := !i + 2 69 | _ -> Buffer.add_char b s.[!i]; incr i 70 end else begin 71 Buffer.add_char b s.[!i]; 72 incr i 73 end 74 done; 75 Buffer.contents b 76 77(* Extract string from JSON node *) 78let json_string = function 79 | Jsont.String (s, _) -> s 80 | _ -> failwith "Expected string" 81 82let json_bool = function 83 | Jsont.Bool (b, _) -> b 84 | _ -> failwith "Expected bool" 85 86let json_array = function 87 | Jsont.Array (arr, _) -> arr 88 | _ -> failwith "Expected array" 89 90let json_object = function 91 | Jsont.Object (obj, _) -> obj 92 | _ -> failwith "Expected object" 93 94let json_mem name obj = 95 match List.find_opt (fun ((n, _), _) -> n = name) obj with 96 | Some (_, v) -> Some v 97 | None -> None 98 99let json_mem_exn name obj = 100 match json_mem name obj with 101 | Some v -> v 102 | None -> failwith ("Missing member: " ^ name) 103 104(* Format JSON for display *) 105let rec json_to_string = function 106 | Jsont.Null _ -> "null" 107 | Jsont.Bool (b, _) -> string_of_bool b 108 | Jsont.Number (n, _) -> Printf.sprintf "%g" n 109 | Jsont.String (s, _) -> Printf.sprintf "%S" s 110 | Jsont.Array (arr, _) -> 111 "[" ^ String.concat ", " (List.map json_to_string arr) ^ "]" 112 | Jsont.Object (obj, _) -> 113 "{" ^ String.concat ", " (List.map (fun ((n, _), v) -> Printf.sprintf "%S: %s" n (json_to_string v)) obj) ^ "}" 114 115(* Parse a single test case from JSON *) 116let parse_test_case ~xml_mode json = 117 let raw_json = json_to_string json in 118 let obj = json_object json in 119 let description = json_string (json_mem_exn "description" obj) in 120 let input = json_string (json_mem_exn "input" obj) in 121 let output = json_array (json_mem_exn "output" obj) in 122 let expected_error_count = match json_mem "errors" obj with 123 | Some e -> List.length (json_array e) 124 | None -> 0 125 in 126 let initial_states = match json_mem "initialStates" obj with 127 | Some s -> List.map json_string (json_array s) 128 | None -> ["Data state"] 129 in 130 let last_start_tag = match json_mem "lastStartTag" obj with 131 | Some s -> Some (json_string s) 132 | None -> None 133 in 134 let double_escaped = match json_mem "doubleEscaped" obj with 135 | Some b -> json_bool b 136 | None -> false 137 in 138 { description; input; output; expected_error_count; initial_states; last_start_tag; double_escaped; xml_mode; raw_json } 139 140(* Convert state name to State.t *) 141let state_of_string = function 142 | "Data state" -> Html5rw.Tokenizer.State.Data 143 | "PLAINTEXT state" -> Html5rw.Tokenizer.State.Plaintext 144 | "RCDATA state" -> Html5rw.Tokenizer.State.Rcdata 145 | "RAWTEXT state" -> Html5rw.Tokenizer.State.Rawtext 146 | "Script data state" -> Html5rw.Tokenizer.State.Script_data 147 | "CDATA section state" -> Html5rw.Tokenizer.State.Cdata_section 148 | s -> failwith ("Unknown state: " ^ s) 149 150(* Convert our token to test format for comparison *) 151let token_to_test_json (tok : Html5rw.Tokenizer.Token.t) : Jsont.json list = 152 let str s = Jsont.String (s, Jsont.Meta.none) in 153 let arr l = Jsont.Array (l, Jsont.Meta.none) in 154 match tok with 155 | Html5rw.Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } -> 156 let name_json = match name with 157 | Some n -> str n 158 | None -> Jsont.Null ((), Jsont.Meta.none) 159 in 160 let public_json = match public_id with 161 | Some p -> str p 162 | None -> Jsont.Null ((), Jsont.Meta.none) 163 in 164 let system_json = match system_id with 165 | Some s -> str s 166 | None -> Jsont.Null ((), Jsont.Meta.none) 167 in 168 let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in 169 [arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]] 170 | Html5rw.Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } -> 171 let attrs_obj = Jsont.Object ( 172 List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs), 173 Jsont.Meta.none 174 ) in 175 if self_closing then 176 [arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]] 177 else 178 [arr [str "StartTag"; str name; attrs_obj]] 179 | Html5rw.Tokenizer.Token.Tag { kind = End; name; _ } -> 180 [arr [str "EndTag"; str name]] 181 | Html5rw.Tokenizer.Token.Comment data -> 182 [arr [str "Comment"; str data]] 183 | Html5rw.Tokenizer.Token.Character data -> 184 (* Split into individual characters for comparison - but actually 185 the tests expect consecutive characters to be merged *) 186 [arr [str "Character"; str data]] 187 | Html5rw.Tokenizer.Token.EOF -> [] 188 189(* Compare JSON values for equality *) 190let rec json_equal a b = 191 match a, b with 192 | Jsont.Null _, Jsont.Null _ -> true 193 | Jsont.Bool (a, _), Jsont.Bool (b, _) -> a = b 194 | Jsont.Number (a, _), Jsont.Number (b, _) -> a = b 195 | Jsont.String (a, _), Jsont.String (b, _) -> a = b 196 | Jsont.Array (a, _), Jsont.Array (b, _) -> 197 List.length a = List.length b && 198 List.for_all2 json_equal a b 199 | Jsont.Object (a, _), Jsont.Object (b, _) -> 200 let a_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) a in 201 let b_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) b in 202 List.length a_sorted = List.length b_sorted && 203 List.for_all2 (fun ((n1, _), v1) ((n2, _), v2) -> n1 = n2 && json_equal v1 v2) a_sorted b_sorted 204 | _ -> false 205 206(* Merge consecutive Character tokens *) 207let merge_character_tokens tokens = 208 let rec loop acc = function 209 | [] -> List.rev acc 210 | Html5rw.Tokenizer.Token.Character s1 :: Html5rw.Tokenizer.Token.Character s2 :: rest -> 211 loop acc (Html5rw.Tokenizer.Token.Character (s1 ^ s2) :: rest) 212 | tok :: rest -> loop (tok :: acc) rest 213 in 214 loop [] tokens 215 216(* Run a single test *) 217let run_test test initial_state = 218 let input = if test.double_escaped then unescape_double test.input else test.input in 219 220 let collector = TokenCollector.create () in 221 let tokenizer = Html5rw.Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in 222 223 (* Set initial state *) 224 Html5rw.Tokenizer.set_state tokenizer initial_state; 225 226 (* Set last start tag if specified *) 227 (match test.last_start_tag with 228 | Some tag -> Html5rw.Tokenizer.set_last_start_tag tokenizer tag 229 | None -> ()); 230 231 (* Run tokenizer *) 232 let reader = Bytes.Reader.of_string input in 233 Html5rw.Tokenizer.run tokenizer (module TokenCollector) reader; 234 235 (* Get results *) 236 let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in 237 let actual_tokens = List.concat_map token_to_test_json tokens in 238 239 (* Unescape expected output if double_escaped *) 240 let expected_output = if test.double_escaped then 241 let rec unescape_json = function 242 | Jsont.String (s, m) -> Jsont.String (unescape_double s, m) 243 | Jsont.Array (arr, m) -> Jsont.Array (List.map unescape_json arr, m) 244 | Jsont.Object (obj, m) -> 245 Jsont.Object (List.map (fun (n, v) -> (n, unescape_json v)) obj, m) 246 | other -> other 247 in 248 List.map unescape_json test.output 249 else test.output 250 in 251 252 (* Merge consecutive Character tokens in expected output too *) 253 let rec merge_expected = function 254 | [] -> [] 255 | [x] -> [x] 256 | Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s1, m1)], am1) :: 257 Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s2, _)], _) :: 258 rest -> 259 merge_expected (Jsont.Array ([Jsont.String ("Character", Jsont.Meta.none); Jsont.String (s1 ^ s2, m1)], am1) :: rest) 260 | x :: rest -> x :: merge_expected rest 261 in 262 let expected = merge_expected expected_output in 263 264 (* Compare *) 265 let tokens_match = 266 List.length actual_tokens = List.length expected && 267 List.for_all2 json_equal actual_tokens expected 268 in 269 270 let actual_error_count = List.length (Html5rw.Tokenizer.get_errors tokenizer) in 271 let errors_count_match = actual_error_count = test.expected_error_count in 272 273 (tokens_match && errors_count_match, actual_tokens, expected, actual_error_count, test.expected_error_count) 274 275(* Run all tests in a file *) 276let run_file path = 277 let content = 278 let ic = open_in path in 279 let n = in_channel_length ic in 280 let s = really_input_string ic n in 281 close_in ic; 282 s 283 in 284 285 (* Parse JSON *) 286 let json = match Jsont_bytesrw.decode_string Jsont.json content with 287 | Ok j -> j 288 | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e) 289 in 290 291 let obj = json_object json in 292 293 (* Handle both {"tests": [...]} and {"xmlViolationTests": [...]} formats *) 294 let regular_tests = 295 match json_mem "tests" obj with 296 | Some t -> List.map (parse_test_case ~xml_mode:false) (json_array t) 297 | None -> [] 298 in 299 let xml_tests = 300 match json_mem "xmlViolationTests" obj with 301 | Some t -> List.map (parse_test_case ~xml_mode:true) (json_array t) 302 | None -> [] 303 in 304 let all_tests = regular_tests @ xml_tests in 305 306 let filename = Filename.basename path in 307 let passed = ref 0 in 308 let failed = ref 0 in 309 let results = ref [] in 310 311 List.iteri (fun i test -> 312 (* Run for each initial state *) 313 List.iter (fun state_name -> 314 try 315 let state = state_of_string state_name in 316 let (success, actual, expected, actual_err_count, expected_err_count) = run_test test state in 317 318 let description = Printf.sprintf "[%s] %s" state_name test.description in 319 let result : Report.test_result = { 320 test_num = i + 1; 321 description; 322 input = test.input; 323 expected = String.concat "\n" (List.map json_to_string expected); 324 actual = String.concat "\n" (List.map json_to_string actual); 325 success; 326 details = [ 327 ("Initial State", state_name); 328 ("Last Start Tag", Option.value test.last_start_tag ~default:"(none)"); 329 ("Double Escaped", string_of_bool test.double_escaped); 330 ("XML Mode", string_of_bool test.xml_mode); 331 ("Expected Errors", string_of_int expected_err_count); 332 ("Actual Errors", string_of_int actual_err_count); 333 ]; 334 raw_test_data = Some test.raw_json; 335 } in 336 results := result :: !results; 337 338 if success then incr passed else incr failed 339 with e -> 340 incr failed; 341 let result : Report.test_result = { 342 test_num = i + 1; 343 description = Printf.sprintf "[%s] %s" state_name test.description; 344 input = test.input; 345 expected = ""; 346 actual = Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e); 347 success = false; 348 details = []; 349 raw_test_data = Some test.raw_json; 350 } in 351 results := result :: !results; 352 Printf.eprintf "Exception in test %d (%s): %s\n" (i + 1) test.description (Printexc.to_string e) 353 ) test.initial_states 354 ) all_tests; 355 356 let file_result : Report.file_result = { 357 filename; 358 test_type = "Tokenizer"; 359 passed_count = !passed; 360 failed_count = !failed; 361 tests = List.rev !results; 362 } in 363 (file_result, !passed, !failed) 364 365let () = 366 let test_dir = Sys.argv.(1) in 367 let files = Sys.readdir test_dir |> Array.to_list in 368 let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in 369 370 let total_passed = ref 0 in 371 let total_failed = ref 0 in 372 let file_results = ref [] in 373 374 List.iter (fun file -> 375 let path = Filename.concat test_dir file in 376 let (file_result, passed, failed) = run_file path in 377 total_passed := !total_passed + passed; 378 total_failed := !total_failed + failed; 379 file_results := file_result :: !file_results; 380 Printf.printf "%s: %d passed, %d failed\n" file passed failed 381 ) (List.sort String.compare test_files); 382 383 Printf.printf "\n=== Summary ===\n"; 384 Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed; 385 386 (* Generate HTML report *) 387 let report : Report.report = { 388 title = "HTML5 Tokenizer Tests"; 389 test_type = "tokenizer"; 390 description = "These tests validate the HTML5 tokenization algorithm as specified in the WHATWG HTML Standard. \ 391 The tokenizer converts HTML input into a stream of tokens (DOCTYPE, start tags, end tags, comments, \ 392 character data, and EOF). Each test specifies input HTML, expected tokens in JSON array format, \ 393 and the initial tokenizer state. Tests cover normal parsing, RCDATA/RAWTEXT/PLAINTEXT states, \ 394 script data parsing, CDATA sections, and various error conditions. Some tests are double-escaped \ 395 to represent special characters. XML violation tests check behavior differences from XML mode."; 396 files = List.rev !file_results; 397 total_passed = !total_passed; 398 total_failed = !total_failed; 399 match_quality = None; 400 test_type_breakdown = None; 401 strictness_mode = None; 402 run_timestamp = None; 403 } in 404 Report.generate_report report "test_tokenizer_report.html"; 405 406 exit (if !total_failed > 0 then 1 else 0)