OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Roundtrip test: Parse -> Serialize -> Re-parse -> Validate
2
3 This test validates that the HTML5 serializer produces valid HTML5
4 by roundtripping the validator test suite files through:
5 1. Parse with HTML5 parser
6 2. Serialize DOM back to HTML
7 3. Re-parse the serialized HTML
8 4. Validate the result
9
10 For "isvalid" tests: the roundtripped document should still be valid
11 For "novalid/haswarn" tests: we just verify the roundtrip works without crashes
12*)
13
14module Report = Test_report
15
16(* Test result type *)
17type test_result = {
18 filename : string;
19 test_type : string; (* isvalid, novalid, haswarn *)
20 original_valid : bool; (* Did original pass validation? *)
21 roundtrip_valid : bool; (* Did roundtripped doc pass validation? *)
22 roundtrip_ok : bool; (* Did roundtrip work without errors? *)
23 original_errors : int;
24 roundtrip_errors : int;
25 parse_error : string option;
26}
27
28(* Get test type from filename *)
29let get_test_type filename =
30 if Astring.String.is_suffix ~affix:"-isvalid.html" filename ||
31 Astring.String.is_suffix ~affix:"-isvalid.xhtml" filename then "isvalid"
32 else if Astring.String.is_suffix ~affix:"-novalid.html" filename ||
33 Astring.String.is_suffix ~affix:"-novalid.xhtml" filename then "novalid"
34 else if Astring.String.is_suffix ~affix:"-haswarn.html" filename ||
35 Astring.String.is_suffix ~affix:"-haswarn.xhtml" filename then "haswarn"
36 else "unknown"
37
38(* Count errors in validation result *)
39let count_errors messages =
40 List.length (List.filter (fun (m : Htmlrw_check.message) ->
41 m.severity = Htmlrw_check.Error
42 ) messages)
43
44(* Serialize a document to HTML string *)
45let serialize_document doc =
46 Html5rw.Dom.to_html ~pretty:false doc
47
48(* Run roundtrip test on a single file *)
49let test_file path =
50 let filename = Filename.basename path in
51 let test_type = get_test_type filename in
52
53 try
54 (* Read file content *)
55 let content =
56 let ic = open_in path in
57 let n = in_channel_length ic in
58 let s = really_input_string ic n in
59 close_in ic;
60 s
61 in
62
63 (* Parse original *)
64 let original_result = Html5rw.parse_bytes (Bytes.of_string content) in
65 let original_doc = Html5rw.root original_result in
66
67 (* Validate original *)
68 let checker_result = Htmlrw_check.check_parsed ~system_id:path original_result in
69 let original_messages = Htmlrw_check.messages checker_result in
70 let original_errors = count_errors original_messages in
71 let original_valid = original_errors = 0 in
72
73 (* Serialize to HTML *)
74 let serialized = serialize_document original_doc in
75
76 (* Re-parse serialized HTML *)
77 let roundtrip_result = Html5rw.parse_bytes (Bytes.of_string serialized) in
78
79 (* Validate roundtripped document *)
80 let roundtrip_checker = Htmlrw_check.check_parsed ~system_id:path roundtrip_result in
81 let roundtrip_messages = Htmlrw_check.messages roundtrip_checker in
82 let roundtrip_errors = count_errors roundtrip_messages in
83 let roundtrip_valid = roundtrip_errors = 0 in
84
85 {
86 filename;
87 test_type;
88 original_valid;
89 roundtrip_valid;
90 roundtrip_ok = true;
91 original_errors;
92 roundtrip_errors;
93 parse_error = None;
94 }
95 with e ->
96 {
97 filename;
98 test_type;
99 original_valid = false;
100 roundtrip_valid = false;
101 roundtrip_ok = false;
102 original_errors = 0;
103 roundtrip_errors = 0;
104 parse_error = Some (Printexc.to_string e);
105 }
106
107(* Recursively find all test files *)
108let rec find_test_files dir =
109 let files = Sys.readdir dir |> Array.to_list in
110 List.concat_map (fun f ->
111 let path = Filename.concat dir f in
112 if Sys.is_directory path then
113 find_test_files path
114 else if Astring.String.is_suffix ~affix:"-isvalid.html" f ||
115 Astring.String.is_suffix ~affix:"-novalid.html" f ||
116 Astring.String.is_suffix ~affix:"-haswarn.html" f then
117 [path]
118 else
119 []
120 ) files
121
122let () =
123 let test_dir = Sys.argv.(1) in
124
125 Printf.printf "Discovering test files...\n%!";
126 let test_files = find_test_files test_dir in
127 Printf.printf "Found %d test files\n%!" (List.length test_files);
128
129 Printf.printf "Running roundtrip tests...\n%!";
130
131 (* Run tests *)
132 let results = List.map test_file test_files in
133
134 (* Categorize results *)
135 let isvalid_tests = List.filter (fun r -> r.test_type = "isvalid") results in
136 let novalid_tests = List.filter (fun r -> r.test_type = "novalid") results in
137 let haswarn_tests = List.filter (fun r -> r.test_type = "haswarn") results in
138
139 (* For isvalid tests: check that roundtripped document is still valid *)
140 let isvalid_passed = List.filter (fun r ->
141 r.roundtrip_ok && r.roundtrip_valid
142 ) isvalid_tests in
143
144 (* For novalid/haswarn tests: just check roundtrip works *)
145 let novalid_passed = List.filter (fun r -> r.roundtrip_ok) novalid_tests in
146 let haswarn_passed = List.filter (fun r -> r.roundtrip_ok) haswarn_tests in
147
148 (* Print failures for isvalid tests *)
149 let isvalid_failed = List.filter (fun r ->
150 not r.roundtrip_ok || not r.roundtrip_valid
151 ) isvalid_tests in
152
153 if List.length isvalid_failed > 0 then begin
154 Printf.printf "\n=== Failing isvalid roundtrip tests (first 20) ===\n";
155 List.iteri (fun i r ->
156 if i < 20 then begin
157 match r.parse_error with
158 | Some err -> Printf.printf "%s: %s\n" r.filename err
159 | None ->
160 Printf.printf "%s: original_valid=%b, roundtrip_valid=%b (errors: %d -> %d)\n"
161 r.filename r.original_valid r.roundtrip_valid
162 r.original_errors r.roundtrip_errors
163 end
164 ) isvalid_failed
165 end;
166
167 (* Print roundtrip failures for all tests *)
168 let roundtrip_failures = List.filter (fun r -> not r.roundtrip_ok) results in
169 if List.length roundtrip_failures > 0 then begin
170 Printf.printf "\n=== Roundtrip failures (first 20) ===\n";
171 List.iteri (fun i r ->
172 if i < 20 then
173 Printf.printf "%s: %s\n" r.filename
174 (Option.value ~default:"unknown error" r.parse_error)
175 ) roundtrip_failures
176 end;
177
178 (* Summary *)
179 Printf.printf "\n=== Roundtrip Test Results ===\n";
180 Printf.printf "isvalid tests: %d/%d passed (roundtripped and still valid)\n"
181 (List.length isvalid_passed) (List.length isvalid_tests);
182 Printf.printf "novalid tests: %d/%d roundtripped successfully\n"
183 (List.length novalid_passed) (List.length novalid_tests);
184 Printf.printf "haswarn tests: %d/%d roundtripped successfully\n"
185 (List.length haswarn_passed) (List.length haswarn_tests);
186
187 let total_roundtrip_ok = List.length (List.filter (fun r -> r.roundtrip_ok) results) in
188 Printf.printf "\nTotal: %d/%d roundtripped without errors\n"
189 total_roundtrip_ok (List.length results);
190 Printf.printf "isvalid preservation: %d/%d still valid after roundtrip\n"
191 (List.length isvalid_passed) (List.length isvalid_tests);
192
193 (* Exit with error if isvalid tests fail validation after roundtrip *)
194 let exit_code = if List.length isvalid_failed > 0 then 1 else 0 in
195 exit exit_code