OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Roundtrip test: Parse -> Serialize -> Re-parse -> Validate
2
3 This test validates that the HTML5 serializer produces valid HTML5
4 by roundtripping the validator test suite files through:
5 1. Parse with HTML5 parser
6 2. Serialize DOM back to HTML
7 3. Re-parse the serialized HTML
8 4. Validate the result
9
10 For "isvalid" tests: the roundtripped document should still be valid
11 For "novalid/haswarn" tests: we just verify the roundtrip works without crashes
12*)
13
14module Report = Test_report
15
16(* Test result type *)
17type test_result = {
18 filename : string;
19 test_type : string; (* isvalid, novalid, haswarn *)
20 original_valid : bool; (* Did original pass validation? *)
21 roundtrip_valid : bool; (* Did roundtripped doc pass validation? *)
22 roundtrip_ok : bool; (* Did roundtrip work without errors? *)
23 original_errors : int;
24 roundtrip_errors : int;
25 parse_error : string option;
26}
27
28(* Get test type from filename *)
29let get_test_type filename =
30 if Astring.String.is_suffix ~affix:"-isvalid.html" filename ||
31 Astring.String.is_suffix ~affix:"-isvalid.xhtml" filename then "isvalid"
32 else if Astring.String.is_suffix ~affix:"-novalid.html" filename ||
33 Astring.String.is_suffix ~affix:"-novalid.xhtml" filename then "novalid"
34 else if Astring.String.is_suffix ~affix:"-haswarn.html" filename ||
35 Astring.String.is_suffix ~affix:"-haswarn.xhtml" filename then "haswarn"
36 else "unknown"
37
38(* Count errors in validation result *)
39let count_errors messages =
40 List.length (List.filter (fun (m : Htmlrw_check.message) ->
41 m.severity = Htmlrw_check.Error
42 ) messages)
43
44(* Serialize a document to HTML string *)
45let serialize_document doc =
46 Html5rw.Dom.to_html ~pretty:false doc
47
48(* Run roundtrip test on a single file *)
49let test_file path =
50 let filename = Filename.basename path in
51 let test_type = get_test_type filename in
52
53 try
54 (* Read file content *)
55 let content =
56 let ic = open_in path in
57 let n = in_channel_length ic in
58 let s = really_input_string ic n in
59 close_in ic;
60 s
61 in
62
63 (* Parse original *)
64 let original_result = Html5rw.parse_bytes (Bytes.of_string content) in
65 let original_doc = Html5rw.root original_result in
66
67 (* Validate original *)
68 let checker_result = Htmlrw_check.check_parsed ~system_id:path original_result in
69 let original_messages = Htmlrw_check.messages checker_result in
70 let original_errors = count_errors original_messages in
71 let original_valid = original_errors = 0 in
72
73 (* Serialize to HTML *)
74 let serialized = serialize_document original_doc in
75
76 (* Re-parse serialized HTML *)
77 let roundtrip_result = Html5rw.parse_bytes (Bytes.of_string serialized) in
78
79 (* Validate roundtripped document *)
80 let roundtrip_checker = Htmlrw_check.check_parsed ~system_id:path roundtrip_result in
81 let roundtrip_messages = Htmlrw_check.messages roundtrip_checker in
82 let roundtrip_errors = count_errors roundtrip_messages in
83 let roundtrip_valid = roundtrip_errors = 0 in
84
85 {
86 filename;
87 test_type;
88 original_valid;
89 roundtrip_valid;
90 roundtrip_ok = true;
91 original_errors;
92 roundtrip_errors;
93 parse_error = None;
94 }
95 with e ->
96 {
97 filename;
98 test_type;
99 original_valid = false;
100 roundtrip_valid = false;
101 roundtrip_ok = false;
102 original_errors = 0;
103 roundtrip_errors = 0;
104 parse_error = Some (Printexc.to_string e);
105 }
106
107(* Recursively find all test files *)
108let rec find_test_files dir =
109 let files = Sys.readdir dir |> Array.to_list in
110 List.concat_map (fun f ->
111 let path = Filename.concat dir f in
112 if Sys.is_directory path then
113 find_test_files path
114 else if Astring.String.is_suffix ~affix:"-isvalid.html" f ||
115 Astring.String.is_suffix ~affix:"-novalid.html" f ||
116 Astring.String.is_suffix ~affix:"-haswarn.html" f then
117 [path]
118 else
119 []
120 ) files
121
122let () =
123 let test_dir = Sys.argv.(1) in
124
125 Printf.printf "Discovering test files...\n%!";
126 let test_files = find_test_files test_dir in
127 Printf.printf "Found %d test files\n%!" (List.length test_files);
128
129 Printf.printf "Running roundtrip tests...\n%!";
130
131 (* Run tests *)
132 let total = List.length test_files in
133 let results = List.mapi (fun i path ->
134 Printf.printf "\r[%d/%d] %s%!" (i + 1) total (Filename.basename path);
135 test_file path
136 ) test_files in
137 Printf.printf "\n%!";
138
139 (* Categorize results *)
140 let isvalid_tests = List.filter (fun r -> r.test_type = "isvalid") results in
141 let novalid_tests = List.filter (fun r -> r.test_type = "novalid") results in
142 let haswarn_tests = List.filter (fun r -> r.test_type = "haswarn") results in
143
144 (* For isvalid tests: check that roundtripped document is still valid *)
145 let isvalid_passed = List.filter (fun r ->
146 r.roundtrip_ok && r.roundtrip_valid
147 ) isvalid_tests in
148
149 (* For novalid/haswarn tests: just check roundtrip works *)
150 let novalid_passed = List.filter (fun r -> r.roundtrip_ok) novalid_tests in
151 let haswarn_passed = List.filter (fun r -> r.roundtrip_ok) haswarn_tests in
152
153 (* Print failures for isvalid tests *)
154 let isvalid_failed = List.filter (fun r ->
155 not r.roundtrip_ok || not r.roundtrip_valid
156 ) isvalid_tests in
157
158 if List.length isvalid_failed > 0 then begin
159 Printf.printf "\n=== Failing isvalid roundtrip tests (first 20) ===\n";
160 List.iteri (fun i r ->
161 if i < 20 then begin
162 match r.parse_error with
163 | Some err -> Printf.printf "%s: %s\n" r.filename err
164 | None ->
165 Printf.printf "%s: original_valid=%b, roundtrip_valid=%b (errors: %d -> %d)\n"
166 r.filename r.original_valid r.roundtrip_valid
167 r.original_errors r.roundtrip_errors
168 end
169 ) isvalid_failed
170 end;
171
172 (* Print roundtrip failures for all tests *)
173 let roundtrip_failures = List.filter (fun r -> not r.roundtrip_ok) results in
174 if List.length roundtrip_failures > 0 then begin
175 Printf.printf "\n=== Roundtrip failures (first 20) ===\n";
176 List.iteri (fun i r ->
177 if i < 20 then
178 Printf.printf "%s: %s\n" r.filename
179 (Option.value ~default:"unknown error" r.parse_error)
180 ) roundtrip_failures
181 end;
182
183 (* Summary *)
184 Printf.printf "\n=== Roundtrip Test Results ===\n";
185 Printf.printf "isvalid tests: %d/%d passed (roundtripped and still valid)\n"
186 (List.length isvalid_passed) (List.length isvalid_tests);
187 Printf.printf "novalid tests: %d/%d roundtripped successfully\n"
188 (List.length novalid_passed) (List.length novalid_tests);
189 Printf.printf "haswarn tests: %d/%d roundtripped successfully\n"
190 (List.length haswarn_passed) (List.length haswarn_tests);
191
192 let total_roundtrip_ok = List.length (List.filter (fun r -> r.roundtrip_ok) results) in
193 Printf.printf "\nTotal: %d/%d roundtripped without errors\n"
194 total_roundtrip_ok (List.length results);
195 Printf.printf "isvalid preservation: %d/%d still valid after roundtrip\n"
196 (List.length isvalid_passed) (List.length isvalid_tests);
197
198 (* Exit with error if isvalid tests fail validation after roundtrip *)
199 let exit_code = if List.length isvalid_failed > 0 then 1 else 0 in
200 exit exit_code