OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Roundtrip test: Parse -> Serialize -> Re-parse -> Validate 2 3 This test validates that the HTML5 serializer produces valid HTML5 4 by roundtripping the validator test suite files through: 5 1. Parse with HTML5 parser 6 2. Serialize DOM back to HTML 7 3. Re-parse the serialized HTML 8 4. Validate the result 9 10 For "isvalid" tests: the roundtripped document should still be valid 11 For "novalid/haswarn" tests: we just verify the roundtrip works without crashes 12*) 13 14module Report = Test_report 15 16(* Test result type *) 17type test_result = { 18 filename : string; 19 test_type : string; (* isvalid, novalid, haswarn *) 20 original_valid : bool; (* Did original pass validation? *) 21 roundtrip_valid : bool; (* Did roundtripped doc pass validation? *) 22 roundtrip_ok : bool; (* Did roundtrip work without errors? *) 23 original_errors : int; 24 roundtrip_errors : int; 25 parse_error : string option; 26} 27 28(* Get test type from filename *) 29let get_test_type filename = 30 if Astring.String.is_suffix ~affix:"-isvalid.html" filename || 31 Astring.String.is_suffix ~affix:"-isvalid.xhtml" filename then "isvalid" 32 else if Astring.String.is_suffix ~affix:"-novalid.html" filename || 33 Astring.String.is_suffix ~affix:"-novalid.xhtml" filename then "novalid" 34 else if Astring.String.is_suffix ~affix:"-haswarn.html" filename || 35 Astring.String.is_suffix ~affix:"-haswarn.xhtml" filename then "haswarn" 36 else "unknown" 37 38(* Count errors in validation result *) 39let count_errors messages = 40 List.length (List.filter (fun (m : Htmlrw_check.message) -> 41 m.severity = Htmlrw_check.Error 42 ) messages) 43 44(* Serialize a document to HTML string *) 45let serialize_document doc = 46 Html5rw.Dom.to_html ~pretty:false doc 47 48(* Run roundtrip test on a single file *) 49let test_file path = 50 let filename = Filename.basename path in 51 let test_type = get_test_type filename in 52 53 try 54 (* Read file content *) 55 let content = 56 let ic = open_in path in 57 let n = in_channel_length ic in 58 let s = really_input_string ic n in 59 close_in ic; 60 s 61 in 62 63 (* Parse original *) 64 let original_result = Html5rw.parse_bytes (Bytes.of_string content) in 65 let original_doc = Html5rw.root original_result in 66 67 (* Validate original *) 68 let checker_result = Htmlrw_check.check_parsed ~system_id:path original_result in 69 let original_messages = Htmlrw_check.messages checker_result in 70 let original_errors = count_errors original_messages in 71 let original_valid = original_errors = 0 in 72 73 (* Serialize to HTML *) 74 let serialized = serialize_document original_doc in 75 76 (* Re-parse serialized HTML *) 77 let roundtrip_result = Html5rw.parse_bytes (Bytes.of_string serialized) in 78 79 (* Validate roundtripped document *) 80 let roundtrip_checker = Htmlrw_check.check_parsed ~system_id:path roundtrip_result in 81 let roundtrip_messages = Htmlrw_check.messages roundtrip_checker in 82 let roundtrip_errors = count_errors roundtrip_messages in 83 let roundtrip_valid = roundtrip_errors = 0 in 84 85 { 86 filename; 87 test_type; 88 original_valid; 89 roundtrip_valid; 90 roundtrip_ok = true; 91 original_errors; 92 roundtrip_errors; 93 parse_error = None; 94 } 95 with e -> 96 { 97 filename; 98 test_type; 99 original_valid = false; 100 roundtrip_valid = false; 101 roundtrip_ok = false; 102 original_errors = 0; 103 roundtrip_errors = 0; 104 parse_error = Some (Printexc.to_string e); 105 } 106 107(* Recursively find all test files *) 108let rec find_test_files dir = 109 let files = Sys.readdir dir |> Array.to_list in 110 List.concat_map (fun f -> 111 let path = Filename.concat dir f in 112 if Sys.is_directory path then 113 find_test_files path 114 else if Astring.String.is_suffix ~affix:"-isvalid.html" f || 115 Astring.String.is_suffix ~affix:"-novalid.html" f || 116 Astring.String.is_suffix ~affix:"-haswarn.html" f then 117 [path] 118 else 119 [] 120 ) files 121 122let () = 123 let test_dir = Sys.argv.(1) in 124 125 Printf.printf "Discovering test files...\n%!"; 126 let test_files = find_test_files test_dir in 127 Printf.printf "Found %d test files\n%!" (List.length test_files); 128 129 Printf.printf "Running roundtrip tests...\n%!"; 130 131 (* Run tests *) 132 let results = List.map test_file test_files in 133 134 (* Categorize results *) 135 let isvalid_tests = List.filter (fun r -> r.test_type = "isvalid") results in 136 let novalid_tests = List.filter (fun r -> r.test_type = "novalid") results in 137 let haswarn_tests = List.filter (fun r -> r.test_type = "haswarn") results in 138 139 (* For isvalid tests: check that roundtripped document is still valid *) 140 let isvalid_passed = List.filter (fun r -> 141 r.roundtrip_ok && r.roundtrip_valid 142 ) isvalid_tests in 143 144 (* For novalid/haswarn tests: just check roundtrip works *) 145 let novalid_passed = List.filter (fun r -> r.roundtrip_ok) novalid_tests in 146 let haswarn_passed = List.filter (fun r -> r.roundtrip_ok) haswarn_tests in 147 148 (* Print failures for isvalid tests *) 149 let isvalid_failed = List.filter (fun r -> 150 not r.roundtrip_ok || not r.roundtrip_valid 151 ) isvalid_tests in 152 153 if List.length isvalid_failed > 0 then begin 154 Printf.printf "\n=== Failing isvalid roundtrip tests (first 20) ===\n"; 155 List.iteri (fun i r -> 156 if i < 20 then begin 157 match r.parse_error with 158 | Some err -> Printf.printf "%s: %s\n" r.filename err 159 | None -> 160 Printf.printf "%s: original_valid=%b, roundtrip_valid=%b (errors: %d -> %d)\n" 161 r.filename r.original_valid r.roundtrip_valid 162 r.original_errors r.roundtrip_errors 163 end 164 ) isvalid_failed 165 end; 166 167 (* Print roundtrip failures for all tests *) 168 let roundtrip_failures = List.filter (fun r -> not r.roundtrip_ok) results in 169 if List.length roundtrip_failures > 0 then begin 170 Printf.printf "\n=== Roundtrip failures (first 20) ===\n"; 171 List.iteri (fun i r -> 172 if i < 20 then 173 Printf.printf "%s: %s\n" r.filename 174 (Option.value ~default:"unknown error" r.parse_error) 175 ) roundtrip_failures 176 end; 177 178 (* Summary *) 179 Printf.printf "\n=== Roundtrip Test Results ===\n"; 180 Printf.printf "isvalid tests: %d/%d passed (roundtripped and still valid)\n" 181 (List.length isvalid_passed) (List.length isvalid_tests); 182 Printf.printf "novalid tests: %d/%d roundtripped successfully\n" 183 (List.length novalid_passed) (List.length novalid_tests); 184 Printf.printf "haswarn tests: %d/%d roundtripped successfully\n" 185 (List.length haswarn_passed) (List.length haswarn_tests); 186 187 let total_roundtrip_ok = List.length (List.filter (fun r -> r.roundtrip_ok) results) in 188 Printf.printf "\nTotal: %d/%d roundtripped without errors\n" 189 total_roundtrip_ok (List.length results); 190 Printf.printf "isvalid preservation: %d/%d still valid after roundtrip\n" 191 (List.length isvalid_passed) (List.length isvalid_tests); 192 193 (* Exit with error if isvalid tests fail validation after roundtrip *) 194 let exit_code = if List.length isvalid_failed > 0 then 1 else 0 in 195 exit exit_code