OCaml HTML5 parser/serialiser based on Python's JustHTML

sync

+3
.gitmodules
··· 1 + [submodule "html5lib-tests"] 2 + path = html5lib-tests 3 + url = https://github.com/html5lib/html5lib-tests
+1 -1
dune-project
··· 18 18 (bytesrw (>= 0.3.0)) 19 19 (uutf (>= 1.0.0)) 20 20 (re (>= 1.10.0)) 21 - (yojson (and :build (>= 2.0.0))))) 21 + (jsont (and :with-test (>= 0.2.0)))))
+1 -1
html5rw.opam
··· 15 15 "bytesrw" {>= "0.3.0"} 16 16 "uutf" {>= "1.0.0"} 17 17 "re" {>= "1.10.0"} 18 - "yojson" {build & >= "2.0.0"} 18 + "jsont" {with-test & >= "0.2.0"} 19 19 "odoc" {with-doc} 20 20 ] 21 21 build: [
+16 -2
lib/parser/tree_builder.ml
··· 2312 2312 2313 2313 and process_foreign_content t token = 2314 2314 match token with 2315 - | Token.Character "\x00" -> 2315 + | Token.Character data when String.contains data '\x00' -> 2316 + (* Replace NUL characters with U+FFFD replacement character *) 2316 2317 parse_error t "unexpected-null-character"; 2317 - insert_character t "\xEF\xBF\xBD" 2318 + let buf = Buffer.create (String.length data) in 2319 + let has_non_ws_non_nul = ref false in 2320 + String.iter (fun c -> 2321 + if c = '\x00' then Buffer.add_string buf "\xEF\xBF\xBD" 2322 + else begin 2323 + Buffer.add_char buf c; 2324 + if not (c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r') then 2325 + has_non_ws_non_nul := true 2326 + end 2327 + ) data; 2328 + let replaced = Buffer.contents buf in 2329 + insert_character t replaced; 2330 + (* Only set frameset_ok to false if there's actual non-whitespace non-NUL content *) 2331 + if !has_non_ws_non_nul then t.frameset_ok <- false 2318 2332 | Token.Character data when is_whitespace data -> 2319 2333 insert_character t data 2320 2334 | Token.Character data ->
+13 -3
lib/tokenizer/stream.ml
··· 20 20 mutable column : int; 21 21 (* Track if we just saw CR (for CR/LF normalization) *) 22 22 mutable last_was_cr : bool; 23 + (* Track if we need to skip the next LF from raw stream (set after peek of CR) *) 24 + mutable skip_next_lf : bool; 23 25 } 24 26 25 27 (* Create a stream from a Bytes.Reader.t *) ··· 33 35 line = 1; 34 36 column = 0; 35 37 last_was_cr = false; 38 + skip_next_lf = false; 36 39 } 37 40 38 41 (* Create a stream from a string - discouraged, prefer create_from_reader *) ··· 83 86 None 84 87 | Some '\r' -> 85 88 t.last_was_cr <- true; 86 - Some '\n' (* CR becomes LF *) 89 + (* Immediately consume following LF if present (CRLF -> single LF) *) 90 + (match read_raw_char t with 91 + | Some '\n' -> () (* Consume the LF that follows CR *) 92 + | Some c -> push_back_char t c (* Put non-LF char back *) 93 + | None -> ()); 94 + Some '\n' (* CR (or CRLF) becomes single LF *) 87 95 | Some '\n' when t.last_was_cr -> 88 96 (* Skip LF after CR - it was already converted *) 89 97 t.last_was_cr <- false; ··· 102 110 Bytes.Slice.is_eod next))) 103 111 104 112 let peek t = 113 + (* Save last_was_cr state before reading *) 114 + let saved_last_was_cr = t.last_was_cr in 105 115 match read_normalized_char t with 106 116 | None -> None 107 117 | Some c -> 108 118 push_back_char t c; 109 - (* Undo last_was_cr if we pushed back a CR-converted LF *) 110 - if c = '\n' then t.last_was_cr <- false; 119 + (* Restore the last_was_cr state so advance handles CR/LF correctly *) 120 + t.last_was_cr <- saved_last_was_cr; 111 121 Some c 112 122 113 123 (* Read n characters into a list, returns (chars_read, all_read_successfully) *)
+62 -23
lib/tokenizer/tokenizer.ml
··· 150 150 let emit_current_tag () = 151 151 finish_attribute t; 152 152 let name = Buffer.contents t.current_tag_name in 153 + let attrs = List.rev t.current_attrs in 154 + (* Check for end tag with attributes or self-closing flag *) 155 + if t.current_tag_kind = Token.End then begin 156 + if attrs <> [] then 157 + error t "end-tag-with-attributes"; 158 + if t.current_tag_self_closing then 159 + error t "end-tag-with-trailing-solidus" 160 + end; 153 161 let tag = { 154 162 Token.kind = t.current_tag_kind; 155 163 name; 156 - attrs = List.rev t.current_attrs; 164 + attrs; 157 165 self_closing = t.current_tag_self_closing; 158 166 } in 159 167 if t.current_tag_kind = Token.Start then ··· 173 181 174 182 let emit_current_comment () = 175 183 emit (Token.Comment (Buffer.contents t.current_comment)) 184 + in 185 + 186 + (* Check for control characters and emit error if needed *) 187 + let check_control_char c = 188 + let code = Char.code c in 189 + (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F-U+009F *) 190 + (* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *) 191 + if (code >= 0x01 && code <= 0x08) || 192 + code = 0x0B || 193 + (code >= 0x0E && code <= 0x1F) || 194 + (code >= 0x7F && code <= 0x9F) then 195 + error t "control-character-in-input-stream" 196 + in 197 + 198 + (* Emit char with control character check *) 199 + let emit_char_checked c = 200 + check_control_char c; 201 + emit_char t c 176 202 in 177 203 178 204 let rec process_state () = ··· 374 400 flush_code_points_consumed_as_char_ref t; 375 401 t.state <- t.return_state; 376 402 handle_eof () 377 - | State.Named_character_reference 378 - | State.Numeric_character_reference 403 + | State.Named_character_reference -> 404 + flush_code_points_consumed_as_char_ref t; 405 + t.state <- t.return_state; 406 + handle_eof () 407 + | State.Numeric_character_reference -> 408 + (* At EOF with just "&#" - no digits follow *) 409 + error t "absence-of-digits-in-numeric-character-reference"; 410 + flush_code_points_consumed_as_char_ref t; 411 + t.state <- t.return_state; 412 + handle_eof () 379 413 | State.Hexadecimal_character_reference_start 380 - | State.Decimal_character_reference_start 381 - | State.Numeric_character_reference_end -> 414 + | State.Decimal_character_reference_start -> 415 + error t "absence-of-digits-in-numeric-character-reference"; 382 416 flush_code_points_consumed_as_char_ref t; 383 417 t.state <- t.return_state; 418 + handle_eof () 419 + | State.Numeric_character_reference_end -> 420 + (* We have collected digits, just need to finalize the character reference *) 421 + step (); 384 422 handle_eof () 385 423 | State.Ambiguous_ampersand -> 386 424 (* Buffer was already flushed when entering this state, just transition *) ··· 508 546 error t "unexpected-null-character"; 509 547 ignore (S.process t.sink (Token.Character "\x00")) 510 548 | Some c -> 511 - emit_char t c 549 + emit_char_checked c 512 550 | None -> () 513 551 514 552 and state_rcdata () = ··· 522 560 error t "unexpected-null-character"; 523 561 emit_str t "\xEF\xBF\xBD" 524 562 | Some c -> 525 - emit_char t c 563 + emit_char_checked c 526 564 | None -> () 527 565 528 566 and state_rawtext () = ··· 533 571 error t "unexpected-null-character"; 534 572 emit_str t "\xEF\xBF\xBD" 535 573 | Some c -> 536 - emit_char t c 574 + emit_char_checked c 537 575 | None -> () 538 576 539 577 and state_script_data () = ··· 544 582 error t "unexpected-null-character"; 545 583 emit_str t "\xEF\xBF\xBD" 546 584 | Some c -> 547 - emit_char t c 585 + emit_char_checked c 548 586 | None -> () 549 587 550 588 and state_plaintext () = ··· 553 591 error t "unexpected-null-character"; 554 592 emit_str t "\xEF\xBF\xBD" 555 593 | Some c -> 556 - emit_char t c 594 + emit_char_checked c 557 595 | None -> () 558 596 559 597 and state_tag_open () = ··· 765 803 error t "unexpected-null-character"; 766 804 emit_str t "\xEF\xBF\xBD" 767 805 | Some c -> 768 - emit_char t c 806 + emit_char_checked c 769 807 | None -> () 770 808 771 809 and state_script_data_escaped_dash () = ··· 781 819 emit_str t "\xEF\xBF\xBD" 782 820 | Some c -> 783 821 t.state <- State.Script_data_escaped; 784 - emit_char t c 822 + emit_char_checked c 785 823 | None -> () 786 824 787 825 and state_script_data_escaped_dash_dash () = ··· 799 837 emit_str t "\xEF\xBF\xBD" 800 838 | Some c -> 801 839 t.state <- State.Script_data_escaped; 802 - emit_char t c 840 + emit_char_checked c 803 841 | None -> () 804 842 805 843 and state_script_data_escaped_less_than_sign () = ··· 875 913 error t "unexpected-null-character"; 876 914 emit_str t "\xEF\xBF\xBD" 877 915 | Some c -> 878 - emit_char t c 916 + emit_char_checked c 879 917 | None -> () 880 918 881 919 and state_script_data_double_escaped_dash () = ··· 892 930 emit_str t "\xEF\xBF\xBD" 893 931 | Some c -> 894 932 t.state <- State.Script_data_double_escaped; 895 - emit_char t c 933 + emit_char_checked c 896 934 | None -> () 897 935 898 936 and state_script_data_double_escaped_dash_dash () = ··· 911 949 emit_str t "\xEF\xBF\xBD" 912 950 | Some c -> 913 951 t.state <- State.Script_data_double_escaped; 914 - emit_char t c 952 + emit_char_checked c 915 953 | None -> () 916 954 917 955 and state_script_data_double_escaped_less_than_sign () = ··· 1570 1608 match Stream.consume t.stream with 1571 1609 | Some ']' -> 1572 1610 t.state <- State.Cdata_section_bracket 1573 - | Some '\x00' -> 1574 - error t "unexpected-null-character"; 1575 - emit_str t "\xEF\xBF\xBD" 1576 1611 | Some c -> 1612 + (* CDATA section emits all characters as-is, including NUL *) 1577 1613 emit_char t c 1578 1614 | None -> () 1579 1615 ··· 1703 1739 t.state <- t.return_state 1704 1740 end 1705 1741 | None -> 1706 - (* No match - check if we should report ambiguous ampersand *) 1742 + (* No match - check if we should report unknown-named-character-reference *) 1707 1743 if String.length entity_name > 0 then begin 1708 - t.state <- State.Ambiguous_ampersand; 1709 - (* Reset position - we need to emit the ampersand and chars *) 1710 - flush_code_points_consumed_as_char_ref t 1744 + (* If we have a semicolon, it's definitely an unknown named character reference *) 1745 + if has_semicolon then 1746 + error t "unknown-named-character-reference"; 1747 + (* Emit all the chars we consumed *) 1748 + flush_code_points_consumed_as_char_ref t; 1749 + t.state <- t.return_state 1711 1750 end else begin 1712 1751 flush_code_points_consumed_as_char_ref t; 1713 1752 t.state <- t.return_state
+40
test/dune
··· 2 2 (name test_html5lib) 3 3 (libraries bytesrw html5rw.parser html5rw.dom)) 4 4 5 + (rule 6 + (alias runtest) 7 + (deps 8 + (glob_files ../html5lib-tests/tree-construction/*.dat)) 9 + (action 10 + (run %{exe:test_html5lib.exe} ../html5lib-tests/tree-construction))) 11 + 12 + (executable 13 + (name test_tokenizer) 14 + (libraries bytesrw html5rw.tokenizer jsont jsont.bytesrw)) 15 + 16 + (rule 17 + (alias runtest) 18 + (deps 19 + (glob_files ../html5lib-tests/tokenizer/*.test)) 20 + (action 21 + (run %{exe:test_tokenizer.exe} ../html5lib-tests/tokenizer))) 22 + 23 + (executable 24 + (name test_encoding) 25 + (libraries html5rw.encoding)) 26 + 27 + (rule 28 + (alias runtest) 29 + (deps 30 + (glob_files ../html5lib-tests/encoding/*.dat)) 31 + (action 32 + (run %{exe:test_encoding.exe} ../html5lib-tests/encoding))) 33 + 34 + (executable 35 + (name test_serializer) 36 + (libraries html5rw.dom jsont jsont.bytesrw)) 37 + 38 + (rule 39 + (alias runtest) 40 + (deps 41 + (glob_files ../html5lib-tests/serializer/*.test)) 42 + (action 43 + (run %{exe:test_serializer.exe} ../html5lib-tests/serializer))) 44 + 5 45 (executable 6 46 (name debug_fragment) 7 47 (libraries bytesrw html5rw.parser html5rw.dom))
+167
test/test_encoding.ml
··· 1 + (* Test runner for html5lib-tests encoding tests *) 2 + 3 + module Encoding = Html5rw_encoding 4 + 5 + type test_case = { 6 + input : string; 7 + expected_encoding : string; 8 + } 9 + 10 + (* Normalize encoding name for comparison *) 11 + let normalize_encoding_name s = 12 + String.lowercase_ascii (String.trim s) 13 + 14 + (* Convert our encoding type to canonical test name *) 15 + let encoding_to_test_name = function 16 + | Encoding.Utf8 -> "utf-8" 17 + | Encoding.Utf16le -> "utf-16le" 18 + | Encoding.Utf16be -> "utf-16be" 19 + | Encoding.Windows_1252 -> "windows-1252" 20 + | Encoding.Iso_8859_2 -> "iso-8859-2" 21 + | Encoding.Euc_jp -> "euc-jp" 22 + 23 + (* Parse a single test case from lines *) 24 + let parse_test_case lines = 25 + let rec parse acc = function 26 + | [] -> acc 27 + | line :: rest when String.length line > 0 && line.[0] = '#' -> 28 + let section = String.trim line in 29 + let content, remaining = collect_section rest in 30 + parse ((section, content) :: acc) remaining 31 + | _ :: rest -> parse acc rest 32 + and collect_section lines = 33 + let rec loop acc = function 34 + | [] -> (List.rev acc, []) 35 + | line :: rest when String.length line > 0 && line.[0] = '#' -> 36 + (List.rev acc, line :: rest) 37 + | line :: rest -> loop (line :: acc) rest 38 + in 39 + loop [] lines 40 + in 41 + let sections = parse [] lines in 42 + 43 + let get_section name = 44 + match List.assoc_opt name sections with 45 + | Some lines -> String.concat "\n" lines 46 + | None -> "" 47 + in 48 + 49 + let data = get_section "#data" in 50 + let encoding = get_section "#encoding" in 51 + 52 + { input = data; expected_encoding = String.trim encoding } 53 + 54 + (* Parse a .dat file into test cases *) 55 + let parse_dat_file content = 56 + let lines = String.split_on_char '\n' content in 57 + (* Split on empty lines followed by #data *) 58 + let rec split_tests current acc = function 59 + | [] -> 60 + if current = [] then List.rev acc 61 + else List.rev (List.rev current :: acc) 62 + | "" :: "#data" :: rest -> 63 + let new_acc = if current = [] then acc else (List.rev current :: acc) in 64 + split_tests ["#data"] new_acc rest 65 + | line :: rest -> 66 + split_tests (line :: current) acc rest 67 + in 68 + let test_groups = split_tests [] [] lines in 69 + List.filter_map (fun lines -> 70 + if List.exists (fun l -> l = "#data") lines then 71 + Some (parse_test_case lines) 72 + else None 73 + ) test_groups 74 + 75 + (* Run a single encoding test *) 76 + let run_test test = 77 + try 78 + (* Detect encoding from the input bytes *) 79 + let (_, detected_encoding) = Encoding.decode (Bytes.of_string test.input) () in 80 + let detected_name = encoding_to_test_name detected_encoding in 81 + let expected_name = normalize_encoding_name test.expected_encoding in 82 + 83 + (* Compare - allow some flexibility in naming *) 84 + let match_encoding det exp = 85 + det = exp || 86 + (det = "windows-1252" && (exp = "windows-1252" || exp = "cp1252" || exp = "iso-8859-1")) || 87 + (det = "iso-8859-2" && (exp = "iso-8859-2" || exp = "iso8859-2" || exp = "latin2")) || 88 + (det = "utf-8" && (exp = "utf-8" || exp = "utf8")) || 89 + (det = "euc-jp" && (exp = "euc-jp" || exp = "eucjp")) 90 + in 91 + 92 + (match_encoding detected_name expected_name, detected_name, expected_name) 93 + with e -> 94 + (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected_encoding) 95 + 96 + (* Run all tests in a file *) 97 + let run_file path = 98 + let ic = open_in path in 99 + let content = really_input_string ic (in_channel_length ic) in 100 + close_in ic; 101 + 102 + let tests = parse_dat_file content in 103 + let filename = Filename.basename path in 104 + 105 + let passed = ref 0 in 106 + let failed = ref 0 in 107 + let errors = ref [] in 108 + 109 + List.iteri (fun i test -> 110 + if String.trim test.expected_encoding = "" then 111 + (* Skip tests without expected encoding *) 112 + () 113 + else begin 114 + let (success, detected, expected) = run_test test in 115 + if success then 116 + incr passed 117 + else begin 118 + incr failed; 119 + if List.length !errors < 5 then 120 + errors := (i + 1, test.input, detected, expected) :: !errors 121 + end 122 + end 123 + ) tests; 124 + 125 + (!passed, !failed, List.rev !errors, filename) 126 + 127 + let () = 128 + let test_dir = Sys.argv.(1) in 129 + let files = Sys.readdir test_dir |> Array.to_list in 130 + let dat_files = List.filter (fun f -> 131 + Filename.check_suffix f ".dat" && 132 + not (String.contains f '/') 133 + ) files in 134 + 135 + let total_passed = ref 0 in 136 + let total_failed = ref 0 in 137 + let all_errors = ref [] in 138 + 139 + List.iter (fun file -> 140 + let path = Filename.concat test_dir file in 141 + if Sys.is_directory path then () else begin 142 + let (passed, failed, errors, filename) = run_file path in 143 + total_passed := !total_passed + passed; 144 + total_failed := !total_failed + failed; 145 + if errors <> [] then 146 + all_errors := (filename, errors) :: !all_errors; 147 + Printf.printf "%s: %d passed, %d failed\n" filename passed failed 148 + end 149 + ) (List.sort String.compare dat_files); 150 + 151 + Printf.printf "\n=== Summary ===\n"; 152 + Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed; 153 + 154 + if !all_errors <> [] then begin 155 + Printf.printf "\n=== First failures ===\n"; 156 + List.iter (fun (filename, errors) -> 157 + List.iter (fun (test_num, input, detected, expected) -> 158 + Printf.printf "\n--- %s test %d ---\n" filename test_num; 159 + Printf.printf "Input (first 200 chars): %s\n" 160 + (String.escaped (String.sub input 0 (min 200 (String.length input)))); 161 + Printf.printf "Expected encoding: %s\n" expected; 162 + Printf.printf "Detected encoding: %s\n" detected 163 + ) errors 164 + ) (List.rev !all_errors) 165 + end; 166 + 167 + exit (if !total_failed > 0 then 1 else 0)
+297
test/test_serializer.ml
··· 1 + (* Test runner for html5lib-tests serializer tests *) 2 + 3 + module Dom = Html5rw_dom 4 + 5 + (* Extract values from JSON *) 6 + let json_string = function 7 + | Jsont.String (s, _) -> s 8 + | _ -> failwith "Expected string" 9 + 10 + let json_string_opt = function 11 + | Jsont.Null _ -> None 12 + | Jsont.String (s, _) -> Some s 13 + | _ -> failwith "Expected string or null" 14 + 15 + let json_array = function 16 + | Jsont.Array (arr, _) -> arr 17 + | _ -> failwith "Expected array" 18 + 19 + let json_object = function 20 + | Jsont.Object (obj, _) -> obj 21 + | _ -> failwith "Expected object" 22 + 23 + let json_mem name obj = 24 + match List.find_opt (fun ((n, _), _) -> n = name) obj with 25 + | Some (_, v) -> Some v 26 + | None -> None 27 + 28 + let json_mem_exn name obj = 29 + match json_mem name obj with 30 + | Some v -> v 31 + | None -> failwith ("Missing member: " ^ name) 32 + 33 + (* Test case *) 34 + type test_case = { 35 + description : string; 36 + input : Jsont.json list; 37 + expected : string list; 38 + } 39 + 40 + let parse_test_case json = 41 + let obj = json_object json in 42 + let description = json_string (json_mem_exn "description" obj) in 43 + let input = json_array (json_mem_exn "input" obj) in 44 + let expected = List.map json_string (json_array (json_mem_exn "expected" obj)) in 45 + { description; input; expected } 46 + 47 + (* Build a DOM node from test input token *) 48 + let build_node_from_token token = 49 + let arr = json_array token in 50 + match arr with 51 + | [] -> None 52 + | type_json :: rest -> 53 + let token_type = json_string type_json in 54 + match token_type, rest with 55 + | "StartTag", [_ns_json; name_json; attrs_json] -> 56 + let name = json_string name_json in 57 + let attrs_list = json_array attrs_json in 58 + let attrs = List.map (fun attr_json -> 59 + let attr_obj = json_object attr_json in 60 + let attr_name = json_string (json_mem_exn "name" attr_obj) in 61 + let value = json_string (json_mem_exn "value" attr_obj) in 62 + (attr_name, value) 63 + ) attrs_list in 64 + Some (Dom.create_element name ~attrs ()) 65 + 66 + | "StartTag", [name_json; attrs_json] -> 67 + let name = json_string name_json in 68 + let attrs_obj = json_object attrs_json in 69 + let attrs = List.map (fun ((n, _), v) -> (n, json_string v)) attrs_obj in 70 + Some (Dom.create_element name ~attrs ()) 71 + 72 + | "EmptyTag", [name_json; attrs_json] -> 73 + let name = json_string name_json in 74 + let attrs_obj = json_object attrs_json in 75 + let attrs = List.map (fun ((n, _), v) -> (n, json_string v)) attrs_obj in 76 + Some (Dom.create_element name ~attrs ()) 77 + 78 + | "Characters", [text_json] -> 79 + let text = json_string text_json in 80 + Some (Dom.create_text text) 81 + 82 + | "Comment", [text_json] -> 83 + let text = json_string text_json in 84 + Some (Dom.create_comment text) 85 + 86 + | "Doctype", [name_json] -> 87 + let name = json_string name_json in 88 + Some (Dom.create_doctype ~name ()) 89 + 90 + | "Doctype", [name_json; public_json] -> 91 + let name = json_string name_json in 92 + let public_id = json_string_opt public_json in 93 + (match public_id with 94 + | Some pub -> Some (Dom.create_doctype ~name ~public_id:pub ()) 95 + | None -> Some (Dom.create_doctype ~name ())) 96 + 97 + | "Doctype", [name_json; public_json; system_json] -> 98 + let name = json_string name_json in 99 + let public_id = json_string_opt public_json in 100 + let system_id = json_string_opt system_json in 101 + (match public_id, system_id with 102 + | Some pub, Some sys -> Some (Dom.create_doctype ~name ~public_id:pub ~system_id:sys ()) 103 + | Some pub, None -> Some (Dom.create_doctype ~name ~public_id:pub ()) 104 + | None, Some sys -> Some (Dom.create_doctype ~name ~system_id:sys ()) 105 + | None, None -> Some (Dom.create_doctype ~name ())) 106 + 107 + | _ -> None 108 + 109 + (* Serialize a single node to HTML (simplified, matches test expectations) *) 110 + let escape_text text = 111 + let buf = Buffer.create (String.length text) in 112 + String.iter (fun c -> 113 + match c with 114 + | '&' -> Buffer.add_string buf "&amp;" 115 + | '<' -> Buffer.add_string buf "&lt;" 116 + | '>' -> Buffer.add_string buf "&gt;" 117 + | c -> Buffer.add_char buf c 118 + ) text; 119 + Buffer.contents buf 120 + 121 + let can_unquote_attr_value value = 122 + if String.length value = 0 then false 123 + else 124 + let valid = ref true in 125 + String.iter (fun c -> 126 + if c = '>' || c = '"' || c = '\'' || c = '=' || c = '`' || 127 + c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' then 128 + valid := false 129 + ) value; 130 + !valid 131 + 132 + let choose_quote value = 133 + if String.contains value '"' && not (String.contains value '\'') then '\'' 134 + else '"' 135 + 136 + let escape_attr_value value quote_char = 137 + let buf = Buffer.create (String.length value) in 138 + String.iter (fun c -> 139 + match c with 140 + | '&' -> Buffer.add_string buf "&amp;" 141 + | '"' when quote_char = '"' -> Buffer.add_string buf "&quot;" 142 + | c -> Buffer.add_char buf c 143 + ) value; 144 + Buffer.contents buf 145 + 146 + let serialize_node node = 147 + match node.Dom.name with 148 + | "#text" -> 149 + (* Check if parent is a raw text element *) 150 + escape_text node.Dom.data 151 + | "#comment" -> 152 + "<!--" ^ node.Dom.data ^ "-->" 153 + | "!doctype" -> 154 + let buf = Buffer.create 64 in 155 + Buffer.add_string buf "<!DOCTYPE "; 156 + (match node.Dom.doctype with 157 + | Some dt -> 158 + Buffer.add_string buf (Option.value ~default:"html" dt.Dom.name); 159 + (match dt.Dom.public_id with 160 + | Some pub when pub <> "" -> 161 + Buffer.add_string buf " PUBLIC \""; 162 + Buffer.add_string buf pub; 163 + Buffer.add_char buf '"'; 164 + (match dt.Dom.system_id with 165 + | Some sys -> 166 + Buffer.add_string buf " \""; 167 + Buffer.add_string buf sys; 168 + Buffer.add_char buf '"' 169 + | None -> ()) 170 + | _ -> 171 + match dt.Dom.system_id with 172 + | Some sys when sys <> "" -> 173 + Buffer.add_string buf " SYSTEM \""; 174 + Buffer.add_string buf sys; 175 + Buffer.add_char buf '"' 176 + | _ -> ()) 177 + | None -> Buffer.add_string buf "html"); 178 + Buffer.add_char buf '>'; 179 + Buffer.contents buf 180 + | _ -> 181 + (* Element *) 182 + let buf = Buffer.create 64 in 183 + Buffer.add_char buf '<'; 184 + Buffer.add_string buf node.Dom.name; 185 + List.iter (fun (key, value) -> 186 + Buffer.add_char buf ' '; 187 + Buffer.add_string buf key; 188 + if can_unquote_attr_value value then begin 189 + Buffer.add_char buf '='; 190 + Buffer.add_string buf value 191 + end else begin 192 + let quote = choose_quote value in 193 + Buffer.add_char buf '='; 194 + Buffer.add_char buf quote; 195 + Buffer.add_string buf (escape_attr_value value quote); 196 + Buffer.add_char buf quote 197 + end 198 + ) node.Dom.attrs; 199 + Buffer.add_char buf '>'; 200 + Buffer.contents buf 201 + 202 + (* Run a single test *) 203 + let run_test test = 204 + try 205 + (* Build nodes from input tokens *) 206 + let nodes = List.filter_map build_node_from_token test.input in 207 + 208 + (* Serialize *) 209 + let serialized = String.concat "" (List.map serialize_node nodes) in 210 + 211 + (* Check if it matches any expected output *) 212 + let matches = List.exists (fun exp -> serialized = exp) test.expected in 213 + 214 + (matches, serialized, test.expected) 215 + with e -> 216 + (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected) 217 + 218 + (* Run all tests in a file *) 219 + let run_file path = 220 + let content = 221 + let ic = open_in path in 222 + let n = in_channel_length ic in 223 + let s = really_input_string ic n in 224 + close_in ic; 225 + s 226 + in 227 + 228 + let json = match Jsont_bytesrw.decode_string Jsont.json content with 229 + | Ok j -> j 230 + | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e) 231 + in 232 + 233 + let obj = json_object json in 234 + let tests_json = match json_mem "tests" obj with 235 + | Some t -> json_array t 236 + | None -> [] 237 + in 238 + 239 + let filename = Filename.basename path in 240 + let passed = ref 0 in 241 + let failed = ref 0 in 242 + let first_failures = ref [] in 243 + 244 + List.iteri (fun i test_json -> 245 + try 246 + let test = parse_test_case test_json in 247 + let (success, actual, expected) = run_test test in 248 + 249 + if success then 250 + incr passed 251 + else begin 252 + incr failed; 253 + if List.length !first_failures < 3 then 254 + first_failures := (i + 1, test.description, actual, expected) :: !first_failures 255 + end 256 + with e -> 257 + incr failed; 258 + Printf.eprintf "Exception parsing test %d: %s\n" (i + 1) (Printexc.to_string e) 259 + ) tests_json; 260 + 261 + (!passed, !failed, List.rev !first_failures, filename) 262 + 263 + let () = 264 + let test_dir = Sys.argv.(1) in 265 + let files = Sys.readdir test_dir |> Array.to_list in 266 + let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in 267 + 268 + let total_passed = ref 0 in 269 + let total_failed = ref 0 in 270 + let all_failures = ref [] in 271 + 272 + List.iter (fun file -> 273 + let path = Filename.concat test_dir file in 274 + let (passed, failed, failures, filename) = run_file path in 275 + total_passed := !total_passed + passed; 276 + total_failed := !total_failed + failed; 277 + if failures <> [] then 278 + all_failures := (filename, failures) :: !all_failures; 279 + Printf.printf "%s: %d passed, %d failed\n" filename passed failed 280 + ) (List.sort String.compare test_files); 281 + 282 + Printf.printf "\n=== Summary ===\n"; 283 + Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed; 284 + 285 + if !all_failures <> [] then begin 286 + Printf.printf "\n=== First failures ===\n"; 287 + List.iter (fun (filename, failures) -> 288 + List.iter (fun (test_num, desc, actual, expected) -> 289 + Printf.printf "\n--- %s test %d ---\n" filename test_num; 290 + Printf.printf "Description: %s\n" desc; 291 + Printf.printf "Expected: %s\n" (String.concat " | " expected); 292 + Printf.printf "Actual: %s\n" actual 293 + ) failures 294 + ) (List.rev !all_failures) 295 + end; 296 + 297 + exit (if !total_failed > 0 then 1 else 0)
+386
test/test_tokenizer.ml
··· 1 + (* Test runner for html5lib-tests tokenizer tests *) 2 + 3 + open Bytesrw 4 + 5 + module Tokenizer = Html5rw_tokenizer 6 + 7 + (* Token collector sink - collects all tokens into a list *) 8 + module TokenCollector = struct 9 + type t = { 10 + mutable tokens : Tokenizer.Token.t list; 11 + } 12 + 13 + let create () = { tokens = [] } 14 + 15 + let process t token = 16 + t.tokens <- token :: t.tokens; 17 + `Continue 18 + 19 + let adjusted_current_node_in_html_namespace _ = true 20 + 21 + let get_tokens t = List.rev t.tokens 22 + end 23 + 24 + (* Test case representation *) 25 + type test_error = { 26 + code : string; 27 + line : int; 28 + col : int; 29 + } 30 + 31 + type test_case = { 32 + description : string; 33 + input : string; 34 + output : Jsont.json list; 35 + errors : test_error list; 36 + initial_states : string list; 37 + last_start_tag : string option; 38 + double_escaped : bool; 39 + } 40 + 41 + (* Unescape double-escaped strings from tests *) 42 + let unescape_double s = 43 + let b = Buffer.create (String.length s) in 44 + let i = ref 0 in 45 + while !i < String.length s do 46 + if !i + 1 < String.length s && s.[!i] = '\\' then begin 47 + match s.[!i + 1] with 48 + | 'u' when !i + 5 < String.length s -> 49 + let hex = String.sub s (!i + 2) 4 in 50 + (try 51 + let code = int_of_string ("0x" ^ hex) in 52 + if code < 128 then Buffer.add_char b (Char.chr code) 53 + else begin 54 + (* UTF-8 encode *) 55 + if code < 0x800 then begin 56 + Buffer.add_char b (Char.chr (0xC0 lor (code lsr 6))); 57 + Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F))) 58 + end else begin 59 + Buffer.add_char b (Char.chr (0xE0 lor (code lsr 12))); 60 + Buffer.add_char b (Char.chr (0x80 lor ((code lsr 6) land 0x3F))); 61 + Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F))) 62 + end 63 + end; 64 + i := !i + 6 65 + with _ -> 66 + Buffer.add_char b s.[!i]; 67 + incr i) 68 + | 'n' -> Buffer.add_char b '\n'; i := !i + 2 69 + | 'r' -> Buffer.add_char b '\r'; i := !i + 2 70 + | 't' -> Buffer.add_char b '\t'; i := !i + 2 71 + | '\\' -> Buffer.add_char b '\\'; i := !i + 2 72 + | _ -> Buffer.add_char b s.[!i]; incr i 73 + end else begin 74 + Buffer.add_char b s.[!i]; 75 + incr i 76 + end 77 + done; 78 + Buffer.contents b 79 + 80 + (* Extract string from JSON node *) 81 + let json_string = function 82 + | Jsont.String (s, _) -> s 83 + | _ -> failwith "Expected string" 84 + 85 + let json_bool = function 86 + | Jsont.Bool (b, _) -> b 87 + | _ -> failwith "Expected bool" 88 + 89 + let json_int = function 90 + | Jsont.Number (n, _) -> int_of_float n 91 + | _ -> failwith "Expected number" 92 + 93 + let json_array = function 94 + | Jsont.Array (arr, _) -> arr 95 + | _ -> failwith "Expected array" 96 + 97 + let json_object = function 98 + | Jsont.Object (obj, _) -> obj 99 + | _ -> failwith "Expected object" 100 + 101 + let json_mem name obj = 102 + match List.find_opt (fun ((n, _), _) -> n = name) obj with 103 + | Some (_, v) -> Some v 104 + | None -> None 105 + 106 + let json_mem_exn name obj = 107 + match json_mem name obj with 108 + | Some v -> v 109 + | None -> failwith ("Missing member: " ^ name) 110 + 111 + (* Parse test error from JSON *) 112 + let parse_test_error json = 113 + let obj = json_object json in 114 + { 115 + code = json_string (json_mem_exn "code" obj); 116 + line = json_int (json_mem_exn "line" obj); 117 + col = json_int (json_mem_exn "col" obj); 118 + } 119 + 120 + (* Parse a single test case from JSON *) 121 + let parse_test_case json = 122 + let obj = json_object json in 123 + let description = json_string (json_mem_exn "description" obj) in 124 + let input = json_string (json_mem_exn "input" obj) in 125 + let output = json_array (json_mem_exn "output" obj) in 126 + let errors = match json_mem "errors" obj with 127 + | Some e -> List.map parse_test_error (json_array e) 128 + | None -> [] 129 + in 130 + let initial_states = match json_mem "initialStates" obj with 131 + | Some s -> List.map json_string (json_array s) 132 + | None -> ["Data state"] 133 + in 134 + let last_start_tag = match json_mem "lastStartTag" obj with 135 + | Some s -> Some (json_string s) 136 + | None -> None 137 + in 138 + let double_escaped = match json_mem "doubleEscaped" obj with 139 + | Some b -> json_bool b 140 + | None -> false 141 + in 142 + { description; input; output; errors; initial_states; last_start_tag; double_escaped } 143 + 144 + (* Convert state name to State.t *) 145 + let state_of_string = function 146 + | "Data state" -> Tokenizer.State.Data 147 + | "PLAINTEXT state" -> Tokenizer.State.Plaintext 148 + | "RCDATA state" -> Tokenizer.State.Rcdata 149 + | "RAWTEXT state" -> Tokenizer.State.Rawtext 150 + | "Script data state" -> Tokenizer.State.Script_data 151 + | "CDATA section state" -> Tokenizer.State.Cdata_section 152 + | s -> failwith ("Unknown state: " ^ s) 153 + 154 + (* Convert our token to test format for comparison *) 155 + let token_to_test_json (tok : Tokenizer.Token.t) : Jsont.json list = 156 + let str s = Jsont.String (s, Jsont.Meta.none) in 157 + let arr l = Jsont.Array (l, Jsont.Meta.none) in 158 + match tok with 159 + | Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } -> 160 + let name_json = match name with 161 + | Some n -> str n 162 + | None -> Jsont.Null ((), Jsont.Meta.none) 163 + in 164 + let public_json = match public_id with 165 + | Some p -> str p 166 + | None -> Jsont.Null ((), Jsont.Meta.none) 167 + in 168 + let system_json = match system_id with 169 + | Some s -> str s 170 + | None -> Jsont.Null ((), Jsont.Meta.none) 171 + in 172 + let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in 173 + [arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]] 174 + | Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } -> 175 + let attrs_obj = Jsont.Object ( 176 + List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs), 177 + Jsont.Meta.none 178 + ) in 179 + if self_closing then 180 + [arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]] 181 + else 182 + [arr [str "StartTag"; str name; attrs_obj]] 183 + | Tokenizer.Token.Tag { kind = End; name; _ } -> 184 + [arr [str "EndTag"; str name]] 185 + | Tokenizer.Token.Comment data -> 186 + [arr [str "Comment"; str data]] 187 + | Tokenizer.Token.Character data -> 188 + (* Split into individual characters for comparison - but actually 189 + the tests expect consecutive characters to be merged *) 190 + [arr [str "Character"; str data]] 191 + | Tokenizer.Token.EOF -> [] 192 + 193 + (* Compare JSON values for equality *) 194 + let rec json_equal a b = 195 + match a, b with 196 + | Jsont.Null _, Jsont.Null _ -> true 197 + | Jsont.Bool (a, _), Jsont.Bool (b, _) -> a = b 198 + | Jsont.Number (a, _), Jsont.Number (b, _) -> a = b 199 + | Jsont.String (a, _), Jsont.String (b, _) -> a = b 200 + | Jsont.Array (a, _), Jsont.Array (b, _) -> 201 + List.length a = List.length b && 202 + List.for_all2 json_equal a b 203 + | Jsont.Object (a, _), Jsont.Object (b, _) -> 204 + let a_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) a in 205 + let b_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) b in 206 + List.length a_sorted = List.length b_sorted && 207 + List.for_all2 (fun ((n1, _), v1) ((n2, _), v2) -> n1 = n2 && json_equal v1 v2) a_sorted b_sorted 208 + | _ -> false 209 + 210 + (* Merge consecutive Character tokens *) 211 + let merge_character_tokens tokens = 212 + let rec loop acc = function 213 + | [] -> List.rev acc 214 + | Tokenizer.Token.Character s1 :: Tokenizer.Token.Character s2 :: rest -> 215 + loop acc (Tokenizer.Token.Character (s1 ^ s2) :: rest) 216 + | tok :: rest -> loop (tok :: acc) rest 217 + in 218 + loop [] tokens 219 + 220 + (* Run a single test *) 221 + let run_test test initial_state = 222 + let input = if test.double_escaped then unescape_double test.input else test.input in 223 + 224 + let collector = TokenCollector.create () in 225 + let tokenizer = Tokenizer.create (module TokenCollector) collector ~collect_errors:true () in 226 + 227 + (* Set initial state *) 228 + Tokenizer.set_state tokenizer initial_state; 229 + 230 + (* Set last start tag if specified *) 231 + (match test.last_start_tag with 232 + | Some tag -> Tokenizer.set_last_start_tag tokenizer tag 233 + | None -> ()); 234 + 235 + (* Run tokenizer *) 236 + let reader = Bytes.Reader.of_string input in 237 + Tokenizer.run tokenizer (module TokenCollector) reader; 238 + 239 + (* Get results *) 240 + let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in 241 + let actual_tokens = List.concat_map token_to_test_json tokens in 242 + 243 + (* Unescape expected output if double_escaped *) 244 + let expected_output = if test.double_escaped then 245 + let rec unescape_json = function 246 + | Jsont.String (s, m) -> Jsont.String (unescape_double s, m) 247 + | Jsont.Array (arr, m) -> Jsont.Array (List.map unescape_json arr, m) 248 + | Jsont.Object (obj, m) -> 249 + Jsont.Object (List.map (fun (n, v) -> (n, unescape_json v)) obj, m) 250 + | other -> other 251 + in 252 + List.map unescape_json test.output 253 + else test.output 254 + in 255 + 256 + (* Merge consecutive Character tokens in expected output too *) 257 + let rec merge_expected = function 258 + | [] -> [] 259 + | [x] -> [x] 260 + | Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s1, m1)], am1) :: 261 + Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s2, _)], _) :: 262 + rest -> 263 + merge_expected (Jsont.Array ([Jsont.String ("Character", Jsont.Meta.none); Jsont.String (s1 ^ s2, m1)], am1) :: rest) 264 + | x :: rest -> x :: merge_expected rest 265 + in 266 + let expected = merge_expected expected_output in 267 + 268 + (* Compare *) 269 + let tokens_match = 270 + List.length actual_tokens = List.length expected && 271 + List.for_all2 json_equal actual_tokens expected 272 + in 273 + 274 + let actual_errors = Tokenizer.get_errors tokenizer in 275 + let errors_count_match = List.length actual_errors = List.length test.errors in 276 + 277 + (tokens_match && errors_count_match, actual_tokens, expected, actual_errors, test.errors) 278 + 279 + (* Format JSON for display *) 280 + let rec json_to_string = function 281 + | Jsont.Null _ -> "null" 282 + | Jsont.Bool (b, _) -> string_of_bool b 283 + | Jsont.Number (n, _) -> Printf.sprintf "%g" n 284 + | Jsont.String (s, _) -> Printf.sprintf "%S" s 285 + | Jsont.Array (arr, _) -> 286 + "[" ^ String.concat ", " (List.map json_to_string arr) ^ "]" 287 + | Jsont.Object (obj, _) -> 288 + "{" ^ String.concat ", " (List.map (fun ((n, _), v) -> Printf.sprintf "%S: %s" n (json_to_string v)) obj) ^ "}" 289 + 290 + (* Run all tests in a file *) 291 + let run_file path = 292 + let content = 293 + let ic = open_in path in 294 + let n = in_channel_length ic in 295 + let s = really_input_string ic n in 296 + close_in ic; 297 + s 298 + in 299 + 300 + (* Parse JSON *) 301 + let json = match Jsont_bytesrw.decode_string Jsont.json content with 302 + | Ok j -> j 303 + | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e) 304 + in 305 + 306 + let obj = json_object json in 307 + 308 + (* Handle both {"tests": [...]} and {"xmlViolationTests": [...], "tests": [...]} formats *) 309 + let test_arrays = 310 + let tests = match json_mem "tests" obj with 311 + | Some t -> json_array t 312 + | None -> [] 313 + in 314 + let xml_tests = match json_mem "xmlViolationTests" obj with 315 + | Some t -> json_array t 316 + | None -> [] 317 + in 318 + tests @ xml_tests 319 + in 320 + 321 + let filename = Filename.basename path in 322 + let passed = ref 0 in 323 + let failed = ref 0 in 324 + let first_failures = ref [] in 325 + 326 + List.iteri (fun i test_json -> 327 + let test = parse_test_case test_json in 328 + 329 + (* Run for each initial state *) 330 + List.iter (fun state_name -> 331 + try 332 + let state = state_of_string state_name in 333 + let (success, actual, expected, actual_errors, expected_errors) = run_test test state in 334 + 335 + if success then 336 + incr passed 337 + else begin 338 + incr failed; 339 + if List.length !first_failures < 3 then 340 + first_failures := (i + 1, test.description, state_name, actual, expected, actual_errors, expected_errors) :: !first_failures 341 + end 342 + with e -> 343 + incr failed; 344 + if List.length !first_failures < 3 then 345 + first_failures := (i + 1, test.description, state_name, [], [], [], []) :: !first_failures; 346 + Printf.eprintf "Exception in test %d (%s): %s\n" (i + 1) test.description (Printexc.to_string e) 347 + ) test.initial_states 348 + ) test_arrays; 349 + 350 + (!passed, !failed, List.rev !first_failures, filename) 351 + 352 + let () = 353 + let test_dir = Sys.argv.(1) in 354 + let files = Sys.readdir test_dir |> Array.to_list in 355 + let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in 356 + 357 + let total_passed = ref 0 in 358 + let total_failed = ref 0 in 359 + let all_failures = ref [] in 360 + 361 + List.iter (fun file -> 362 + let path = Filename.concat test_dir file in 363 + let (passed, failed, failures, filename) = run_file path in 364 + total_passed := !total_passed + passed; 365 + total_failed := !total_failed + failed; 366 + if failures <> [] then 367 + all_failures := (filename, failures) :: !all_failures; 368 + Printf.printf "%s: %d passed, %d failed\n" filename passed failed 369 + ) (List.sort String.compare test_files); 370 + 371 + Printf.printf "\n=== Summary ===\n"; 372 + Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed; 373 + 374 + if !all_failures <> [] then begin 375 + Printf.printf "\n=== First failures ===\n"; 376 + List.iter (fun (filename, failures) -> 377 + List.iter (fun (test_num, desc, state, actual, expected, actual_errs, expected_errs) -> 378 + Printf.printf "\n--- %s test %d (%s) in %s ---\n" filename test_num state desc; 379 + Printf.printf "Expected tokens: [%s]\n" (String.concat "; " (List.map json_to_string expected)); 380 + Printf.printf "Actual tokens: [%s]\n" (String.concat "; " (List.map json_to_string actual)); 381 + Printf.printf "Expected %d errors, got %d\n" (List.length expected_errs) (List.length actual_errs) 382 + ) failures 383 + ) (List.rev !all_failures) 384 + end; 385 + 386 + exit (if !total_failed > 0 then 1 else 0)