OCaml HTML5 parser/serialiser based on Python's JustHTML

roundtrip

Changed files
+200
test
+5
test/dune
··· 99 (name debug_check) 100 (modules debug_check) 101 (libraries html5rw.checker bytesrw))
··· 99 (name debug_check) 100 (modules debug_check) 101 (libraries html5rw.checker bytesrw)) 102 + 103 + (executable 104 + (name test_roundtrip) 105 + (modules test_roundtrip) 106 + (libraries bytesrw html5rw html5rw.checker astring test_report))
+195
test/test_roundtrip.ml
···
··· 1 + (* Roundtrip test: Parse -> Serialize -> Re-parse -> Validate 2 + 3 + This test validates that the HTML5 serializer produces valid HTML5 4 + by roundtripping the validator test suite files through: 5 + 1. Parse with HTML5 parser 6 + 2. Serialize DOM back to HTML 7 + 3. Re-parse the serialized HTML 8 + 4. Validate the result 9 + 10 + For "isvalid" tests: the roundtripped document should still be valid 11 + For "novalid/haswarn" tests: we just verify the roundtrip works without crashes 12 + *) 13 + 14 + module Report = Test_report 15 + 16 + (* Test result type *) 17 + type test_result = { 18 + filename : string; 19 + test_type : string; (* isvalid, novalid, haswarn *) 20 + original_valid : bool; (* Did original pass validation? *) 21 + roundtrip_valid : bool; (* Did roundtripped doc pass validation? *) 22 + roundtrip_ok : bool; (* Did roundtrip work without errors? *) 23 + original_errors : int; 24 + roundtrip_errors : int; 25 + parse_error : string option; 26 + } 27 + 28 + (* Get test type from filename *) 29 + let get_test_type filename = 30 + if Astring.String.is_suffix ~affix:"-isvalid.html" filename || 31 + Astring.String.is_suffix ~affix:"-isvalid.xhtml" filename then "isvalid" 32 + else if Astring.String.is_suffix ~affix:"-novalid.html" filename || 33 + Astring.String.is_suffix ~affix:"-novalid.xhtml" filename then "novalid" 34 + else if Astring.String.is_suffix ~affix:"-haswarn.html" filename || 35 + Astring.String.is_suffix ~affix:"-haswarn.xhtml" filename then "haswarn" 36 + else "unknown" 37 + 38 + (* Count errors in validation result *) 39 + let count_errors messages = 40 + List.length (List.filter (fun (m : Html5_checker.Message.t) -> 41 + m.severity = Html5_checker.Message.Error 42 + ) messages) 43 + 44 + (* Serialize a document to HTML string *) 45 + let serialize_document doc = 46 + Html5rw.Dom.to_html ~pretty:false doc 47 + 48 + (* Run roundtrip test on a single file *) 49 + let test_file path = 50 + let filename = Filename.basename path in 51 + let test_type = get_test_type filename in 52 + 53 + try 54 + (* Read file content *) 55 + let content = 56 + let ic = open_in path in 57 + let n = in_channel_length ic in 58 + let s = really_input_string ic n in 59 + close_in ic; 60 + s 61 + in 62 + 63 + (* Parse original *) 64 + let original_result = Html5rw.parse_bytes (Bytes.of_string content) in 65 + let original_doc = Html5rw.root original_result in 66 + 67 + (* Validate original *) 68 + let checker_result = Html5_checker.check_dom ~system_id:path original_result in 69 + let original_messages = Html5_checker.messages checker_result in 70 + let original_errors = count_errors original_messages in 71 + let original_valid = original_errors = 0 in 72 + 73 + (* Serialize to HTML *) 74 + let serialized = serialize_document original_doc in 75 + 76 + (* Re-parse serialized HTML *) 77 + let roundtrip_result = Html5rw.parse_bytes (Bytes.of_string serialized) in 78 + 79 + (* Validate roundtripped document *) 80 + let roundtrip_checker = Html5_checker.check_dom ~system_id:path roundtrip_result in 81 + let roundtrip_messages = Html5_checker.messages roundtrip_checker in 82 + let roundtrip_errors = count_errors roundtrip_messages in 83 + let roundtrip_valid = roundtrip_errors = 0 in 84 + 85 + { 86 + filename; 87 + test_type; 88 + original_valid; 89 + roundtrip_valid; 90 + roundtrip_ok = true; 91 + original_errors; 92 + roundtrip_errors; 93 + parse_error = None; 94 + } 95 + with e -> 96 + { 97 + filename; 98 + test_type; 99 + original_valid = false; 100 + roundtrip_valid = false; 101 + roundtrip_ok = false; 102 + original_errors = 0; 103 + roundtrip_errors = 0; 104 + parse_error = Some (Printexc.to_string e); 105 + } 106 + 107 + (* Recursively find all test files *) 108 + let rec find_test_files dir = 109 + let files = Sys.readdir dir |> Array.to_list in 110 + List.concat_map (fun f -> 111 + let path = Filename.concat dir f in 112 + if Sys.is_directory path then 113 + find_test_files path 114 + else if Astring.String.is_suffix ~affix:"-isvalid.html" f || 115 + Astring.String.is_suffix ~affix:"-novalid.html" f || 116 + Astring.String.is_suffix ~affix:"-haswarn.html" f then 117 + [path] 118 + else 119 + [] 120 + ) files 121 + 122 + let () = 123 + let test_dir = Sys.argv.(1) in 124 + 125 + Printf.printf "Discovering test files...\n%!"; 126 + let test_files = find_test_files test_dir in 127 + Printf.printf "Found %d test files\n%!" (List.length test_files); 128 + 129 + Printf.printf "Running roundtrip tests...\n%!"; 130 + 131 + (* Run tests *) 132 + let results = List.map test_file test_files in 133 + 134 + (* Categorize results *) 135 + let isvalid_tests = List.filter (fun r -> r.test_type = "isvalid") results in 136 + let novalid_tests = List.filter (fun r -> r.test_type = "novalid") results in 137 + let haswarn_tests = List.filter (fun r -> r.test_type = "haswarn") results in 138 + 139 + (* For isvalid tests: check that roundtripped document is still valid *) 140 + let isvalid_passed = List.filter (fun r -> 141 + r.roundtrip_ok && r.roundtrip_valid 142 + ) isvalid_tests in 143 + 144 + (* For novalid/haswarn tests: just check roundtrip works *) 145 + let novalid_passed = List.filter (fun r -> r.roundtrip_ok) novalid_tests in 146 + let haswarn_passed = List.filter (fun r -> r.roundtrip_ok) haswarn_tests in 147 + 148 + (* Print failures for isvalid tests *) 149 + let isvalid_failed = List.filter (fun r -> 150 + not r.roundtrip_ok || not r.roundtrip_valid 151 + ) isvalid_tests in 152 + 153 + if List.length isvalid_failed > 0 then begin 154 + Printf.printf "\n=== Failing isvalid roundtrip tests (first 20) ===\n"; 155 + List.iteri (fun i r -> 156 + if i < 20 then begin 157 + match r.parse_error with 158 + | Some err -> Printf.printf "%s: %s\n" r.filename err 159 + | None -> 160 + Printf.printf "%s: original_valid=%b, roundtrip_valid=%b (errors: %d -> %d)\n" 161 + r.filename r.original_valid r.roundtrip_valid 162 + r.original_errors r.roundtrip_errors 163 + end 164 + ) isvalid_failed 165 + end; 166 + 167 + (* Print roundtrip failures for all tests *) 168 + let roundtrip_failures = List.filter (fun r -> not r.roundtrip_ok) results in 169 + if List.length roundtrip_failures > 0 then begin 170 + Printf.printf "\n=== Roundtrip failures (first 20) ===\n"; 171 + List.iteri (fun i r -> 172 + if i < 20 then 173 + Printf.printf "%s: %s\n" r.filename 174 + (Option.value ~default:"unknown error" r.parse_error) 175 + ) roundtrip_failures 176 + end; 177 + 178 + (* Summary *) 179 + Printf.printf "\n=== Roundtrip Test Results ===\n"; 180 + Printf.printf "isvalid tests: %d/%d passed (roundtripped and still valid)\n" 181 + (List.length isvalid_passed) (List.length isvalid_tests); 182 + Printf.printf "novalid tests: %d/%d roundtripped successfully\n" 183 + (List.length novalid_passed) (List.length novalid_tests); 184 + Printf.printf "haswarn tests: %d/%d roundtripped successfully\n" 185 + (List.length haswarn_passed) (List.length haswarn_tests); 186 + 187 + let total_roundtrip_ok = List.length (List.filter (fun r -> r.roundtrip_ok) results) in 188 + Printf.printf "\nTotal: %d/%d roundtripped without errors\n" 189 + total_roundtrip_ok (List.length results); 190 + Printf.printf "isvalid preservation: %d/%d still valid after roundtrip\n" 191 + (List.length isvalid_passed) (List.length isvalid_tests); 192 + 193 + (* Exit with error if isvalid tests fail validation after roundtrip *) 194 + let exit_code = if List.length isvalid_failed > 0 then 1 else 0 in 195 + exit exit_code