OCaml HTML5 parser/serialiser based on Python's JustHTML

vendor validator

-1
.gitignore
··· 23 23 # Opam local switch 24 24 _opam/ 25 25 *.html 26 - validator 27 26 28 27 # Session/planning files 29 28 PLAN*.md
+3
.gitmodules
··· 1 1 [submodule "html5lib-tests"] 2 2 path = html5lib-tests 3 3 url = https://github.com/html5lib/html5lib-tests 4 + [submodule "validator"] 5 + path = validator 6 + url = https://github.com/validator/validator.git
+16 -15
test/dune
··· 55 55 (action 56 56 (run %{exe:test_serializer.exe} ../html5lib-tests/serializer))) 57 57 58 - (executable 59 - (name test_nesting_checker) 60 - (modules test_nesting_checker) 61 - (libraries html5rw.check)) 62 - 63 - (executable 64 - (name test_html5_checker) 65 - (modules test_html5_checker) 66 - (libraries bytesrw html5rw html5rw.check str)) 67 - 68 - (rule 69 - (alias runtest) 70 - (action 71 - (run %{exe:test_html5_checker.exe}))) 72 - 73 58 (library 74 59 (name validator_messages) 75 60 (modules validator_messages) ··· 85 70 (modules test_validator) 86 71 (libraries bytesrw html5rw html5rw.check str jsont jsont.bytesrw test_report validator_messages expected_message unix)) 87 72 73 + (rule 74 + (alias runtest) 75 + (deps 76 + (glob_files_rec ../validator/tests/**/*.html) 77 + (glob_files_rec ../validator/tests/**/*.xhtml) 78 + ../validator/tests/messages.json) 79 + (action 80 + (run %{exe:test_validator.exe} ../validator/tests))) 81 + 88 82 (executable 89 83 (name test_roundtrip) 90 84 (modules test_roundtrip) 91 85 (libraries bytesrw html5rw html5rw.check astring test_report)) 86 + 87 + (rule 88 + (alias runtest) 89 + (deps 90 + (glob_files_rec ../validator/tests/**/*.html)) 91 + (action 92 + (run %{exe:test_roundtrip.exe} ../validator/tests)))
-298
test/test_html5_checker.ml
··· 1 - (** Tests for the html5_checker library *) 2 - 3 - (** Helper to create a reader from a string *) 4 - let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 - 6 - (** Helper to check if a message contains a substring *) 7 - let message_contains msg substring = 8 - String.lowercase_ascii msg.Htmlrw_check.text 9 - |> fun s -> String.length s >= String.length substring && 10 - try 11 - ignore (Str.search_forward (Str.regexp_case_fold (Str.quote substring)) s 0); 12 - true 13 - with Not_found -> false 14 - 15 - (** Test that valid HTML5 produces no errors *) 16 - let test_valid_html5 () = 17 - Printf.printf "Test 1: Valid HTML5 document\n"; 18 - let html = {|<!DOCTYPE html> 19 - <html lang="en"> 20 - <head><title>Test</title></head> 21 - <body><p>Hello world</p></body> 22 - </html>|} in 23 - let reader = reader_of_string html in 24 - let result = Htmlrw_check.check reader in 25 - let errors = Htmlrw_check.errors result in 26 - Printf.printf " Found %d error(s)\n" (List.length errors); 27 - if List.length errors > 0 then begin 28 - List.iter (fun msg -> 29 - Printf.printf " - %s\n" msg.Htmlrw_check.text 30 - ) errors; 31 - end else 32 - Printf.printf " OK: No errors as expected\n" 33 - 34 - (** Test that missing DOCTYPE is detected *) 35 - let test_missing_doctype () = 36 - Printf.printf "\nTest 2: Missing DOCTYPE\n"; 37 - let html = "<html><body>Hello</body></html>" in 38 - let reader = reader_of_string html in 39 - let result = Htmlrw_check.check reader in 40 - let errors = Htmlrw_check.errors result in 41 - Printf.printf " Found %d error(s)\n" (List.length errors); 42 - if List.length errors = 0 then 43 - Printf.printf " Warning: Expected parse errors for missing DOCTYPE\n" 44 - else begin 45 - List.iter (fun msg -> 46 - Printf.printf " - %s\n" msg.Htmlrw_check.text 47 - ) errors; 48 - end 49 - 50 - (** Test that obsolete elements are detected *) 51 - let test_obsolete_element () = 52 - Printf.printf "\nTest 3: Obsolete <center> element\n"; 53 - let html = "<!DOCTYPE html><html><body><center>Centered</center></body></html>" in 54 - let reader = reader_of_string html in 55 - let result = Htmlrw_check.check reader in 56 - let all_msgs = Htmlrw_check.messages result in 57 - Printf.printf " Found %d message(s)\n" (List.length all_msgs); 58 - let obsolete_msgs = List.filter (fun m -> 59 - message_contains m "obsolete" || message_contains m "center" 60 - ) all_msgs in 61 - if List.length obsolete_msgs > 0 then begin 62 - Printf.printf " Found obsolete-related messages:\n"; 63 - List.iter (fun msg -> 64 - Printf.printf " - %s\n" msg.Htmlrw_check.text 65 - ) obsolete_msgs; 66 - end else 67 - Printf.printf " Note: No obsolete element warnings found (checker may not be enabled)\n" 68 - 69 - (** Test duplicate IDs *) 70 - let test_duplicate_id () = 71 - Printf.printf "\nTest 4: Duplicate ID attributes\n"; 72 - let html = {|<!DOCTYPE html><html><body> 73 - <div id="foo">First</div> 74 - <div id="foo">Second</div> 75 - </body></html>|} in 76 - let reader = reader_of_string html in 77 - let result = Htmlrw_check.check reader in 78 - let all_msgs = Htmlrw_check.messages result in 79 - Printf.printf " Found %d message(s)\n" (List.length all_msgs); 80 - let id_msgs = List.filter (fun m -> 81 - message_contains m "duplicate" || message_contains m "id" 82 - ) all_msgs in 83 - if List.length id_msgs > 0 then begin 84 - Printf.printf " Found ID-related messages:\n"; 85 - List.iter (fun msg -> 86 - Printf.printf " - %s\n" msg.Htmlrw_check.text 87 - ) id_msgs; 88 - end else 89 - Printf.printf " Note: No duplicate ID errors found (checker may not be enabled)\n" 90 - 91 - (** Test heading structure *) 92 - let test_heading_skip () = 93 - Printf.printf "\nTest 5: Skipped heading level\n"; 94 - let html = {|<!DOCTYPE html><html><body> 95 - <h1>Title</h1> 96 - <h3>Skipped h2</h3> 97 - </body></html>|} in 98 - let reader = reader_of_string html in 99 - let result = Htmlrw_check.check reader in 100 - let all_msgs = Htmlrw_check.messages result in 101 - Printf.printf " Found %d message(s)\n" (List.length all_msgs); 102 - let heading_msgs = List.filter (fun m -> 103 - message_contains m "heading" || message_contains m "skip" 104 - ) all_msgs in 105 - if List.length heading_msgs > 0 then begin 106 - Printf.printf " Found heading-related messages:\n"; 107 - List.iter (fun msg -> 108 - Printf.printf " - %s\n" msg.Htmlrw_check.text 109 - ) heading_msgs; 110 - end else 111 - Printf.printf " Note: No heading structure warnings found (checker may not be enabled)\n" 112 - 113 - (** Test img without alt *) 114 - let test_img_without_alt () = 115 - Printf.printf "\nTest 6: Image without alt attribute\n"; 116 - let html = {|<!DOCTYPE html><html><body> 117 - <img src="test.jpg"> 118 - </body></html>|} in 119 - let reader = reader_of_string html in 120 - let result = Htmlrw_check.check reader in 121 - let all_msgs = Htmlrw_check.messages result in 122 - Printf.printf " Found %d message(s)\n" (List.length all_msgs); 123 - let img_msgs = List.filter (fun m -> 124 - message_contains m "alt" || (message_contains m "img" && message_contains m "attribute") 125 - ) all_msgs in 126 - if List.length img_msgs > 0 then begin 127 - Printf.printf " Found img/alt-related messages:\n"; 128 - List.iter (fun msg -> 129 - Printf.printf " - %s\n" msg.Htmlrw_check.text 130 - ) img_msgs; 131 - end else 132 - Printf.printf " Note: No missing alt attribute errors found (checker may not be enabled)\n" 133 - 134 - (** Test invalid nesting *) 135 - let test_invalid_nesting () = 136 - Printf.printf "\nTest 7: Invalid nesting - <a> inside <a>\n"; 137 - let html = {|<!DOCTYPE html><html><body> 138 - <a href="#">Link <a href="#">Nested</a></a> 139 - </body></html>|} in 140 - let reader = reader_of_string html in 141 - let result = Htmlrw_check.check reader in 142 - let all_msgs = Htmlrw_check.messages result in 143 - Printf.printf " Found %d message(s)\n" (List.length all_msgs); 144 - let nesting_msgs = List.filter (fun m -> 145 - message_contains m "nesting" || message_contains m "nested" || message_contains m "ancestor" 146 - ) all_msgs in 147 - if List.length nesting_msgs > 0 then begin 148 - Printf.printf " Found nesting-related messages:\n"; 149 - List.iter (fun msg -> 150 - Printf.printf " - %s\n" msg.Htmlrw_check.text 151 - ) nesting_msgs; 152 - end else 153 - Printf.printf " Note: No nesting errors found (checker may not be enabled)\n" 154 - 155 - (** Test form inside form *) 156 - let test_form_nesting () = 157 - Printf.printf "\nTest 8: Invalid nesting - <form> inside <form>\n"; 158 - let html = {|<!DOCTYPE html><html><body> 159 - <form><form></form></form> 160 - </body></html>|} in 161 - let reader = reader_of_string html in 162 - let result = Htmlrw_check.check reader in 163 - let all_msgs = Htmlrw_check.messages result in 164 - Printf.printf " Found %d message(s)\n" (List.length all_msgs); 165 - let form_msgs = List.filter (fun m -> 166 - message_contains m "form" 167 - ) all_msgs in 168 - if List.length form_msgs > 0 then begin 169 - Printf.printf " Found form-related messages:\n"; 170 - List.iter (fun msg -> 171 - Printf.printf " - %s\n" msg.Htmlrw_check.text 172 - ) form_msgs; 173 - end else 174 - Printf.printf " Note: No form nesting errors found (checker may not be enabled)\n" 175 - 176 - (** Test output formatting *) 177 - let test_output_formats () = 178 - Printf.printf "\nTest 9: Output format testing\n"; 179 - let html = {|<!DOCTYPE html><html><body><p>Test</p></body></html>|} in 180 - let reader = reader_of_string html in 181 - let result = Htmlrw_check.check reader in 182 - 183 - Printf.printf " Testing text format:\n"; 184 - let text_output = Htmlrw_check.to_text result in 185 - Printf.printf " Length: %d chars\n" (String.length text_output); 186 - 187 - Printf.printf " Testing JSON format:\n"; 188 - let json_output = Htmlrw_check.to_json result in 189 - Printf.printf " Length: %d chars\n" (String.length json_output); 190 - 191 - Printf.printf " Testing GNU format:\n"; 192 - let gnu_output = Htmlrw_check.to_gnu result in 193 - Printf.printf " Length: %d chars\n" (String.length gnu_output) 194 - 195 - (** Test has_errors function *) 196 - let test_has_errors () = 197 - Printf.printf "\nTest 10: has_errors function\n"; 198 - 199 - (* Valid document should have no errors *) 200 - let valid_html = "<!DOCTYPE html><html><body><p>Valid</p></body></html>" in 201 - let result1 = Htmlrw_check.check (reader_of_string valid_html) in 202 - Printf.printf " Valid document has_errors: %b\n" (Htmlrw_check.has_errors result1); 203 - 204 - (* Document with likely parse errors *) 205 - let invalid_html = "<html><body><p>Unclosed" in 206 - let result2 = Htmlrw_check.check (reader_of_string invalid_html) in 207 - Printf.printf " Invalid document has_errors: %b\n" (Htmlrw_check.has_errors result2) 208 - 209 - (** Test check_dom with pre-parsed document *) 210 - let test_check_dom () = 211 - Printf.printf "\nTest 11: check_dom with pre-parsed document\n"; 212 - let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in 213 - let reader = reader_of_string html in 214 - let parsed = Html5rw.parse reader in 215 - let result = Htmlrw_check.check_parsed parsed in 216 - let all_msgs = Htmlrw_check.messages result in 217 - Printf.printf " check_dom found %d message(s)\n" (List.length all_msgs); 218 - Printf.printf " OK: check_dom completed successfully\n" 219 - 220 - (** Test system_id parameter *) 221 - let test_system_id () = 222 - Printf.printf "\nTest 12: system_id parameter\n"; 223 - let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in 224 - let reader = reader_of_string html in 225 - let result = Htmlrw_check.check ~system_id:"test.html" reader in 226 - match Htmlrw_check.system_id result with 227 - | Some id -> Printf.printf " system_id: %s\n" id 228 - | None -> Printf.printf " Warning: system_id not set\n" 229 - 230 - (** Test collect_parse_errors flag *) 231 - let test_collect_parse_errors_flag () = 232 - Printf.printf "\nTest 13: collect_parse_errors flag\n"; 233 - let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in 234 - 235 - let result_with = Htmlrw_check.check ~collect_parse_errors:true (reader_of_string html) in 236 - let msgs_with = Htmlrw_check.messages result_with in 237 - Printf.printf " With parse errors: %d message(s)\n" (List.length msgs_with); 238 - 239 - let result_without = Htmlrw_check.check ~collect_parse_errors:false (reader_of_string html) in 240 - let msgs_without = Htmlrw_check.messages result_without in 241 - Printf.printf " Without parse errors: %d message(s)\n" (List.length msgs_without) 242 - 243 - (** Test document accessor *) 244 - let test_document_accessor () = 245 - Printf.printf "\nTest 14: document accessor\n"; 246 - let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in 247 - let reader = reader_of_string html in 248 - let result = Htmlrw_check.check reader in 249 - let _doc = Htmlrw_check.document result in 250 - Printf.printf " OK: document accessor works\n" 251 - 252 - (** Test message severity filtering *) 253 - let test_severity_filtering () = 254 - Printf.printf "\nTest 15: Message severity filtering\n"; 255 - let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in 256 - let reader = reader_of_string html in 257 - let result = Htmlrw_check.check reader in 258 - 259 - let all_msgs = Htmlrw_check.messages result in 260 - let errors = Htmlrw_check.errors result in 261 - let warnings = Htmlrw_check.warnings result in 262 - 263 - Printf.printf " Total messages: %d\n" (List.length all_msgs); 264 - Printf.printf " Errors: %d\n" (List.length errors); 265 - Printf.printf " Warnings: %d\n" (List.length warnings); 266 - 267 - (* Verify that errors + warnings <= all messages *) 268 - if List.length errors + List.length warnings <= List.length all_msgs then 269 - Printf.printf " OK: Message counts are consistent\n" 270 - else 271 - Printf.printf " Warning: Message counts inconsistent\n" 272 - 273 - (** Run all tests *) 274 - let () = 275 - Printf.printf "Running html5_checker tests...\n"; 276 - Printf.printf "========================================\n\n"; 277 - 278 - test_valid_html5 (); 279 - test_missing_doctype (); 280 - test_obsolete_element (); 281 - test_duplicate_id (); 282 - test_heading_skip (); 283 - test_img_without_alt (); 284 - test_invalid_nesting (); 285 - test_form_nesting (); 286 - test_output_formats (); 287 - test_has_errors (); 288 - test_check_dom (); 289 - test_system_id (); 290 - test_collect_parse_errors_flag (); 291 - test_document_accessor (); 292 - test_severity_filtering (); 293 - 294 - Printf.printf "\n========================================\n"; 295 - Printf.printf "All tests completed!\n"; 296 - Printf.printf "\nNote: Some checkers may not be enabled yet.\n"; 297 - Printf.printf "Tests marked with 'Note:' indicate features that may be\n"; 298 - Printf.printf "implemented in future versions.\n"
-68
test/test_nesting_checker.ml
··· 1 - (** Test for nesting checker functionality via public API *) 2 - 3 - let check_html html = 4 - let reader = Bytesrw.Bytes.Reader.of_string html in 5 - Htmlrw_check.check reader 6 - 7 - let () = 8 - (* Test 1: <a> cannot contain another <a> *) 9 - Printf.printf "Test 1: Checking <a href> inside <a href>\n"; 10 - let result1 = check_html "<a href='#'><a href='#'>nested</a></a>" in 11 - let errors1 = Htmlrw_check.errors result1 in 12 - Printf.printf " Found %d error(s)\n" (List.length errors1); 13 - List.iter (fun msg -> 14 - Printf.printf " - %s\n" msg.Htmlrw_check.text 15 - ) errors1; 16 - 17 - (* Test 2: <button> inside <a> *) 18 - Printf.printf "\nTest 2: Checking <button> inside <a href>\n"; 19 - let result2 = check_html "<a href='#'><button>click</button></a>" in 20 - let errors2 = Htmlrw_check.errors result2 in 21 - Printf.printf " Found %d error(s)\n" (List.length errors2); 22 - List.iter (fun msg -> 23 - Printf.printf " - %s\n" msg.Htmlrw_check.text 24 - ) errors2; 25 - 26 - (* Test 3: form inside form *) 27 - Printf.printf "\nTest 3: Checking <form> inside <form>\n"; 28 - let result3 = check_html "<form><form>nested</form></form>" in 29 - let errors3 = Htmlrw_check.errors result3 in 30 - Printf.printf " Found %d error(s)\n" (List.length errors3); 31 - List.iter (fun msg -> 32 - Printf.printf " - %s\n" msg.Htmlrw_check.text 33 - ) errors3; 34 - 35 - (* Test 4: header inside footer (should be allowed) *) 36 - Printf.printf "\nTest 4: Checking <header> inside <footer>\n"; 37 - let result4 = check_html "<footer><header>test</header></footer>" in 38 - let errors4 = Htmlrw_check.errors result4 in 39 - Printf.printf " Found %d error(s)\n" (List.length errors4); 40 - if List.length errors4 > 0 then 41 - List.iter (fun msg -> 42 - Printf.printf " - %s\n" msg.Htmlrw_check.text 43 - ) errors4 44 - else 45 - Printf.printf " OK: No errors (header inside footer is valid)\n"; 46 - 47 - (* Test 5: input inside button *) 48 - Printf.printf "\nTest 5: Checking <input type=text> inside <button>\n"; 49 - let result5 = check_html "<button><input type='text'></button>" in 50 - let errors5 = Htmlrw_check.errors result5 in 51 - Printf.printf " Found %d error(s)\n" (List.length errors5); 52 - List.iter (fun msg -> 53 - Printf.printf " - %s\n" msg.Htmlrw_check.text 54 - ) errors5; 55 - 56 - (* Test 6: valid nesting - should not error *) 57 - Printf.printf "\nTest 6: Checking valid nesting: <div> inside <div>\n"; 58 - let result6 = check_html "<div><div>nested</div></div>" in 59 - let errors6 = Htmlrw_check.errors result6 in 60 - Printf.printf " Found %d error(s)\n" (List.length errors6); 61 - if List.length errors6 = 0 then 62 - Printf.printf " OK: No errors as expected\n" 63 - else 64 - List.iter (fun msg -> 65 - Printf.printf " - %s\n" msg.Htmlrw_check.text 66 - ) errors6; 67 - 68 - Printf.printf "\nAll tests completed!\n"