OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 7.2 kB view raw
1(* Roundtrip test: Parse -> Serialize -> Re-parse -> Validate 2 3 This test validates that the HTML5 serializer produces valid HTML5 4 by roundtripping the validator test suite files through: 5 1. Parse with HTML5 parser 6 2. Serialize DOM back to HTML 7 3. Re-parse the serialized HTML 8 4. Validate the result 9 10 For "isvalid" tests: the roundtripped document should still be valid 11 For "novalid/haswarn" tests: we just verify the roundtrip works without crashes 12*) 13 14module Report = Test_report 15 16(* Test result type *) 17type test_result = { 18 filename : string; 19 test_type : string; (* isvalid, novalid, haswarn *) 20 original_valid : bool; (* Did original pass validation? *) 21 roundtrip_valid : bool; (* Did roundtripped doc pass validation? *) 22 roundtrip_ok : bool; (* Did roundtrip work without errors? *) 23 original_errors : int; 24 roundtrip_errors : int; 25 parse_error : string option; 26} 27 28(* Get test type from filename *) 29let get_test_type filename = 30 if Astring.String.is_suffix ~affix:"-isvalid.html" filename || 31 Astring.String.is_suffix ~affix:"-isvalid.xhtml" filename then "isvalid" 32 else if Astring.String.is_suffix ~affix:"-novalid.html" filename || 33 Astring.String.is_suffix ~affix:"-novalid.xhtml" filename then "novalid" 34 else if Astring.String.is_suffix ~affix:"-haswarn.html" filename || 35 Astring.String.is_suffix ~affix:"-haswarn.xhtml" filename then "haswarn" 36 else "unknown" 37 38(* Count errors in validation result *) 39let count_errors messages = 40 List.length (List.filter (fun (m : Htmlrw_check.message) -> 41 m.severity = Htmlrw_check.Error 42 ) messages) 43 44(* Serialize a document to HTML string *) 45let serialize_document doc = 46 Html5rw.Dom.to_html ~pretty:false doc 47 48(* Run roundtrip test on a single file *) 49let test_file path = 50 let filename = Filename.basename path in 51 let test_type = get_test_type filename in 52 53 try 54 (* Read file content *) 55 let content = 56 let ic = open_in path in 57 let n = in_channel_length ic in 58 let s = really_input_string ic n in 59 close_in ic; 60 s 61 in 62 63 (* Parse original *) 64 let original_result = Html5rw.parse_bytes (Bytes.of_string content) in 65 let original_doc = Html5rw.root original_result in 66 67 (* Validate original *) 68 let checker_result = Htmlrw_check.check_parsed ~system_id:path original_result in 69 let original_messages = Htmlrw_check.messages checker_result in 70 let original_errors = count_errors original_messages in 71 let original_valid = original_errors = 0 in 72 73 (* Serialize to HTML *) 74 let serialized = serialize_document original_doc in 75 76 (* Re-parse serialized HTML *) 77 let roundtrip_result = Html5rw.parse_bytes (Bytes.of_string serialized) in 78 79 (* Validate roundtripped document *) 80 let roundtrip_checker = Htmlrw_check.check_parsed ~system_id:path roundtrip_result in 81 let roundtrip_messages = Htmlrw_check.messages roundtrip_checker in 82 let roundtrip_errors = count_errors roundtrip_messages in 83 let roundtrip_valid = roundtrip_errors = 0 in 84 85 { 86 filename; 87 test_type; 88 original_valid; 89 roundtrip_valid; 90 roundtrip_ok = true; 91 original_errors; 92 roundtrip_errors; 93 parse_error = None; 94 } 95 with e -> 96 { 97 filename; 98 test_type; 99 original_valid = false; 100 roundtrip_valid = false; 101 roundtrip_ok = false; 102 original_errors = 0; 103 roundtrip_errors = 0; 104 parse_error = Some (Printexc.to_string e); 105 } 106 107(* Recursively find all test files *) 108let rec find_test_files dir = 109 let files = Sys.readdir dir |> Array.to_list in 110 List.concat_map (fun f -> 111 let path = Filename.concat dir f in 112 if Sys.is_directory path then 113 find_test_files path 114 else if Astring.String.is_suffix ~affix:"-isvalid.html" f || 115 Astring.String.is_suffix ~affix:"-novalid.html" f || 116 Astring.String.is_suffix ~affix:"-haswarn.html" f then 117 [path] 118 else 119 [] 120 ) files 121 122let () = 123 let test_dir = Sys.argv.(1) in 124 125 Printf.printf "Discovering test files...\n%!"; 126 let test_files = find_test_files test_dir in 127 Printf.printf "Found %d test files\n%!" (List.length test_files); 128 129 Printf.printf "Running roundtrip tests...\n%!"; 130 131 (* Run tests *) 132 let total = List.length test_files in 133 let results = List.mapi (fun i path -> 134 Printf.printf "\r[%d/%d] %s%!" (i + 1) total (Filename.basename path); 135 test_file path 136 ) test_files in 137 Printf.printf "\n%!"; 138 139 (* Categorize results *) 140 let isvalid_tests = List.filter (fun r -> r.test_type = "isvalid") results in 141 let novalid_tests = List.filter (fun r -> r.test_type = "novalid") results in 142 let haswarn_tests = List.filter (fun r -> r.test_type = "haswarn") results in 143 144 (* For isvalid tests: check that roundtripped document is still valid *) 145 let isvalid_passed = List.filter (fun r -> 146 r.roundtrip_ok && r.roundtrip_valid 147 ) isvalid_tests in 148 149 (* For novalid/haswarn tests: just check roundtrip works *) 150 let novalid_passed = List.filter (fun r -> r.roundtrip_ok) novalid_tests in 151 let haswarn_passed = List.filter (fun r -> r.roundtrip_ok) haswarn_tests in 152 153 (* Print failures for isvalid tests *) 154 let isvalid_failed = List.filter (fun r -> 155 not r.roundtrip_ok || not r.roundtrip_valid 156 ) isvalid_tests in 157 158 if List.length isvalid_failed > 0 then begin 159 Printf.printf "\n=== Failing isvalid roundtrip tests (first 20) ===\n"; 160 List.iteri (fun i r -> 161 if i < 20 then begin 162 match r.parse_error with 163 | Some err -> Printf.printf "%s: %s\n" r.filename err 164 | None -> 165 Printf.printf "%s: original_valid=%b, roundtrip_valid=%b (errors: %d -> %d)\n" 166 r.filename r.original_valid r.roundtrip_valid 167 r.original_errors r.roundtrip_errors 168 end 169 ) isvalid_failed 170 end; 171 172 (* Print roundtrip failures for all tests *) 173 let roundtrip_failures = List.filter (fun r -> not r.roundtrip_ok) results in 174 if List.length roundtrip_failures > 0 then begin 175 Printf.printf "\n=== Roundtrip failures (first 20) ===\n"; 176 List.iteri (fun i r -> 177 if i < 20 then 178 Printf.printf "%s: %s\n" r.filename 179 (Option.value ~default:"unknown error" r.parse_error) 180 ) roundtrip_failures 181 end; 182 183 (* Summary *) 184 Printf.printf "\n=== Roundtrip Test Results ===\n"; 185 Printf.printf "isvalid tests: %d/%d passed (roundtripped and still valid)\n" 186 (List.length isvalid_passed) (List.length isvalid_tests); 187 Printf.printf "novalid tests: %d/%d roundtripped successfully\n" 188 (List.length novalid_passed) (List.length novalid_tests); 189 Printf.printf "haswarn tests: %d/%d roundtripped successfully\n" 190 (List.length haswarn_passed) (List.length haswarn_tests); 191 192 let total_roundtrip_ok = List.length (List.filter (fun r -> r.roundtrip_ok) results) in 193 Printf.printf "\nTotal: %d/%d roundtripped without errors\n" 194 total_roundtrip_ok (List.length results); 195 Printf.printf "isvalid preservation: %d/%d still valid after roundtrip\n" 196 (List.length isvalid_passed) (List.length isvalid_tests); 197 198 (* Exit with error if isvalid tests fail validation after roundtrip *) 199 let exit_code = if List.length isvalid_failed > 0 then 1 else 0 in 200 exit exit_code