OCaml HTML5 parser/serialiser based on Python's JustHTML

Add comprehensive fuzzing test suite

Implements 9 specialized fuzzers to exercise different aspects of the HTML parser:

- fuzz_properties.ml: Property-based testing (8 invariants including parse
stability, output bounds, idempotence)
- fuzz_structure.ml: Structure-aware HTML generation with mutation strategies
- fuzz_exhaustion.ml: Resource exhaustion tests for DoS vectors (deep nesting,
wide trees, huge text, many attributes, etc.)
- fuzz_error_recovery.ml: Error recovery tests (12 categories including
unclosed tags, misnested elements, invalid attributes)
- fuzz_serializer.ml: Serializer-specific tests (attributes, void elements,
raw text, whitespace, entities, foreign content)
- fuzz_streaming.ml: Parsing determinism and roundtrip stability tests
- fuzz_encoding.ml: UTF-8 handling, BOM, surrogates, control characters
- fuzz_fragment.ml: Fragment parsing with various context elements
- fuzz_security.ml: mXSS stability and XSS vector handling

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

+46
fuzz/dune
··· 5 5 (executable 6 6 (name fuzz_afl) 7 7 (libraries bytesrw html5rw afl-persistent)) 8 + 8 9 (executable (name test_crash) (libraries bytesrw html5rw)) 9 10 (executable (name test_pre) (libraries bytesrw html5rw)) 11 + 12 + ; Property-based testing (AFL) 13 + (executable 14 + (name fuzz_properties) 15 + (libraries bytesrw html5rw str afl-persistent)) 16 + 17 + ; Structure-aware fuzzer (AFL) 18 + (executable 19 + (name fuzz_structure) 20 + (libraries bytesrw html5rw str)) 21 + 22 + ; Resource exhaustion tests 23 + (executable 24 + (name fuzz_exhaustion) 25 + (libraries bytesrw html5rw unix)) 26 + 27 + ; Error recovery tests 28 + (executable 29 + (name fuzz_error_recovery) 30 + (libraries bytesrw html5rw)) 31 + 32 + ; Serializer-specific tests 33 + (executable 34 + (name fuzz_serializer) 35 + (libraries bytesrw html5rw str)) 36 + 37 + ; Streaming/incremental tests 38 + (executable 39 + (name fuzz_streaming) 40 + (libraries bytesrw html5rw)) 41 + 42 + ; Encoding tests 43 + (executable 44 + (name fuzz_encoding) 45 + (libraries bytesrw html5rw)) 46 + 47 + ; Fragment parsing tests 48 + (executable 49 + (name fuzz_fragment) 50 + (libraries bytesrw html5rw)) 51 + 52 + ; Security/sanitizer tests 53 + (executable 54 + (name fuzz_security) 55 + (libraries bytesrw html5rw str))
+260
fuzz/fuzz_encoding.ml
··· 1 + (* Encoding fuzzer for HTML5rw 2 + Tests UTF-8 handling, BOM, surrogates, and encoding edge cases *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Test helper: parse and serialize, check for crashes *) 7 + let test_encoding input = 8 + try 9 + let r = Html5rw.parse (reader_of_string input) in 10 + let _ = Html5rw.to_string ~pretty:false r in 11 + true 12 + with _ -> 13 + false 14 + 15 + (* Test helper: check roundtrip stability *) 16 + let test_roundtrip input = 17 + try 18 + let r1 = Html5rw.parse (reader_of_string input) in 19 + let s1 = Html5rw.to_string ~pretty:false r1 in 20 + let r2 = Html5rw.parse (reader_of_string s1) in 21 + let s2 = Html5rw.to_string ~pretty:false r2 in 22 + let r3 = Html5rw.parse (reader_of_string s2) in 23 + let s3 = Html5rw.to_string ~pretty:false r3 in 24 + s2 = s3 25 + with _ -> false 26 + 27 + (* UTF-8 BOM *) 28 + let bom_cases = [| 29 + "\xEF\xBB\xBF"; (* Just BOM *) 30 + "\xEF\xBB\xBF<!DOCTYPE html>"; (* BOM + DOCTYPE *) 31 + "\xEF\xBB\xBF<div>test</div>"; (* BOM + content *) 32 + "\xEF\xBB\xBF\xEF\xBB\xBF"; (* Double BOM *) 33 + "<div>\xEF\xBB\xBF</div>"; (* BOM in content - should be preserved as text *) 34 + |] 35 + 36 + (* Valid UTF-8 sequences *) 37 + let valid_utf8_cases = [| 38 + (* 1-byte (ASCII) *) 39 + "hello"; 40 + "<div>test</div>"; 41 + 42 + (* 2-byte sequences *) 43 + "\xC2\xA0"; (* NBSP *) 44 + "\xC3\xA9"; (* é *) 45 + "caf\xC3\xA9"; (* café *) 46 + "\xC2\xAB\xC2\xBB"; (* « » *) 47 + 48 + (* 3-byte sequences *) 49 + "\xE2\x80\x93"; (* en-dash *) 50 + "\xE2\x80\x94"; (* em-dash *) 51 + "\xE2\x80\x99"; (* right single quote *) 52 + "\xE2\x80\x9C\xE2\x80\x9D"; (* curly quotes *) 53 + "\xE4\xB8\xAD\xE6\x96\x87"; (* 中文 *) 54 + "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"; (* 日本語 *) 55 + 56 + (* 4-byte sequences (emoji, etc.) *) 57 + "\xF0\x9F\x98\x80"; (* 😀 *) 58 + "\xF0\x9F\x8E\x89"; (* 🎉 *) 59 + "\xF0\x9D\x94\xB8"; (* 𝔸 mathematical double-struck *) 60 + 61 + (* Mixed *) 62 + "<div>\xC3\xA9\xE2\x80\x93\xF0\x9F\x98\x80</div>"; 63 + "<span title=\"\xC3\xA9\">text</span>"; 64 + |] 65 + 66 + (* Invalid UTF-8 sequences (should be handled gracefully) *) 67 + let invalid_utf8_cases = [| 68 + (* Lone continuation bytes *) 69 + "\x80"; 70 + "\xBF"; 71 + "\x80\x80\x80"; 72 + 73 + (* Overlong sequences *) 74 + "\xC0\x80"; (* Overlong NUL *) 75 + "\xE0\x80\x80"; (* Overlong NUL (3-byte) *) 76 + "\xF0\x80\x80\x80"; (* Overlong NUL (4-byte) *) 77 + "\xC0\xAF"; (* Overlong / *) 78 + "\xC1\xBF"; (* Overlong DEL *) 79 + 80 + (* Truncated sequences *) 81 + "\xC2"; (* Start of 2-byte, missing continuation *) 82 + "\xE0\x80"; (* Start of 3-byte, missing continuation *) 83 + "\xF0\x80\x80"; (* Start of 4-byte, missing continuation *) 84 + 85 + (* Invalid start bytes *) 86 + "\xFE"; 87 + "\xFF"; 88 + "\xFE\xFF"; (* UTF-16 BE BOM as bytes *) 89 + "\xFF\xFE"; (* UTF-16 LE BOM as bytes *) 90 + 91 + (* Surrogate pairs (invalid in UTF-8) *) 92 + "\xED\xA0\x80"; (* High surrogate U+D800 *) 93 + "\xED\xBF\xBF"; (* Low surrogate U+DFFF *) 94 + "\xED\xA0\x80\xED\xB0\x80"; (* Surrogate pair (should be single 4-byte) *) 95 + 96 + (* Out of range *) 97 + "\xF4\x90\x80\x80"; (* U+110000, beyond Unicode *) 98 + "\xF7\xBF\xBF\xBF"; (* U+1FFFFF, way beyond *) 99 + 100 + (* Invalid sequence in tag *) 101 + "<div\x80>"; 102 + "<div class=\"\x80\">"; 103 + "</\x80div>"; 104 + 105 + (* Invalid in attribute value *) 106 + "<div data-x=\"\xC0\xAF\">"; 107 + "<div title=\"\xED\xA0\x80\">"; 108 + |] 109 + 110 + (* Control characters *) 111 + let control_char_cases = [| 112 + (* NUL *) 113 + "\x00"; 114 + "<div>\x00</div>"; 115 + "<div attr=\"\x00\">"; 116 + 117 + (* Other C0 controls *) 118 + "\x01\x02\x03"; 119 + "<div>\x08</div>"; (* backspace *) 120 + "<div>\x0B</div>"; (* vertical tab *) 121 + "<div>\x0C</div>"; (* form feed *) 122 + 123 + (* C1 controls (as UTF-8) *) 124 + "\xC2\x80"; (* U+0080 *) 125 + "\xC2\x9F"; (* U+009F *) 126 + 127 + (* DEL *) 128 + "\x7F"; 129 + 130 + (* Mixed with valid content *) 131 + "<div>hello\x00world</div>"; 132 + "<div>test\x01\x02\x03</div>"; 133 + |] 134 + 135 + (* Unicode special characters *) 136 + let special_unicode_cases = [| 137 + (* Zero-width characters *) 138 + "\xE2\x80\x8B"; (* ZWSP U+200B *) 139 + "\xE2\x80\x8C"; (* ZWNJ U+200C *) 140 + "\xE2\x80\x8D"; (* ZWJ U+200D *) 141 + "\xEF\xBB\xBF"; (* BOM/ZWNBSP U+FEFF *) 142 + 143 + (* Replacement character *) 144 + "\xEF\xBF\xBD"; (* U+FFFD *) 145 + 146 + (* Byte order marks and special noncharacters *) 147 + "\xEF\xBF\xBE"; (* U+FFFE - noncharacter *) 148 + "\xEF\xBF\xBF"; (* U+FFFF - noncharacter *) 149 + 150 + (* Private use area *) 151 + "\xEE\x80\x80"; (* U+E000 *) 152 + "\xEF\xA3\xBF"; (* U+F8FF *) 153 + 154 + (* RTL and BiDi *) 155 + "\xE2\x80\x8F"; (* RLM U+200F *) 156 + "\xE2\x80\xAE"; (* RLO U+202E *) 157 + "\xE2\x80\xAC"; (* PDF U+202C *) 158 + 159 + (* Combining characters *) 160 + "e\xCC\x81"; (* e + combining acute = é *) 161 + "a\xCC\x80\xCC\x81\xCC\x82"; (* Multiple combining marks *) 162 + 163 + (* In HTML context *) 164 + "<div>\xE2\x80\x8B</div>"; 165 + "<span title=\"\xE2\x80\x8F\">"; 166 + |] 167 + 168 + (* Numeric character references *) 169 + let ncr_cases = [| 170 + (* Valid decimal *) 171 + "&#65;"; (* A *) 172 + "&#169;"; (* © *) 173 + "&#8212;"; (* em-dash *) 174 + "&#128512;"; (* 😀 *) 175 + 176 + (* Valid hex *) 177 + "&#x41;"; 178 + "&#xA9;"; 179 + "&#x2014;"; 180 + "&#x1F600;"; 181 + 182 + (* Edge cases *) 183 + "&#0;"; (* NUL - should become replacement *) 184 + "&#x0;"; 185 + "&#127;"; (* DEL *) 186 + "&#128;"; (* C1 control *) 187 + "&#159;"; (* Last C1 control *) 188 + 189 + (* Surrogates (should be replaced) *) 190 + "&#xD800;"; 191 + "&#xDFFF;"; 192 + "&#55296;"; (* D800 decimal *) 193 + 194 + (* Noncharacters *) 195 + "&#xFFFE;"; 196 + "&#xFFFF;"; 197 + "&#x1FFFE;"; 198 + 199 + (* Beyond Unicode *) 200 + "&#x110000;"; 201 + "&#1114112;"; 202 + 203 + (* Very large numbers *) 204 + "&#999999999;"; 205 + "&#xFFFFFFFFF;"; 206 + 207 + (* Invalid formats *) 208 + "&#;"; 209 + "&#x;"; 210 + "&#xGHI;"; 211 + "&#abc;"; 212 + |] 213 + 214 + let run_test_category name cases test_fn = 215 + let passed = ref 0 in 216 + let failed = ref 0 in 217 + Array.iter (fun input -> 218 + if test_fn input then 219 + incr passed 220 + else begin 221 + Printf.printf " FAIL: %s\n" 222 + (String.escaped (String.sub input 0 (min 40 (String.length input)))); 223 + incr failed 224 + end 225 + ) cases; 226 + Printf.printf "%s: %d/%d\n" name !passed (Array.length cases); 227 + !failed = 0 228 + 229 + let () = 230 + Printf.printf "=== Encoding Tests ===\n\n"; 231 + 232 + let all_pass = ref true in 233 + 234 + (* Test basic handling (no exceptions) *) 235 + Printf.printf "--- Crash resistance tests ---\n"; 236 + if not (run_test_category "BOM handling" bom_cases test_encoding) then 237 + all_pass := false; 238 + if not (run_test_category "Valid UTF-8" valid_utf8_cases test_encoding) then 239 + all_pass := false; 240 + if not (run_test_category "Invalid UTF-8" invalid_utf8_cases test_encoding) then 241 + all_pass := false; 242 + if not (run_test_category "Control characters" control_char_cases test_encoding) then 243 + all_pass := false; 244 + if not (run_test_category "Special Unicode" special_unicode_cases test_encoding) then 245 + all_pass := false; 246 + if not (run_test_category "Numeric character refs" ncr_cases test_encoding) then 247 + all_pass := false; 248 + 249 + (* Test roundtrip stability for valid cases *) 250 + Printf.printf "\n--- Roundtrip stability tests ---\n"; 251 + if not (run_test_category "Valid UTF-8 roundtrip" valid_utf8_cases test_roundtrip) then 252 + all_pass := false; 253 + 254 + Printf.printf "\n=== Summary ===\n"; 255 + if !all_pass then 256 + Printf.printf "All encoding tests passed!\n" 257 + else begin 258 + Printf.printf "Some encoding tests failed!\n"; 259 + exit 1 260 + end
+224
fuzz/fuzz_error_recovery.ml
··· 1 + (* Error recovery fuzzer for HTML5rw 2 + Tests the parser's ability to handle and recover from various error conditions *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Category 1: Unclosed tags *) 7 + let unclosed_tag_cases = [| 8 + "<div>"; 9 + "<div><span>"; 10 + "<div><div><div>"; 11 + "<p><p><p>"; 12 + "<a href='x'><b><i>"; 13 + "<table><tr><td>"; 14 + "<ul><li><li><li>"; 15 + "<select><option><option>"; 16 + "<dl><dt><dd>"; 17 + |] 18 + 19 + (* Category 2: Misnested tags *) 20 + let misnested_cases = [| 21 + "<a><b></a></b>"; 22 + "<b><i></b></i>"; 23 + "<p><div></p></div>"; 24 + "<a href='x'><div></a></div>"; 25 + "<em><strong></em></strong>"; 26 + "<b><i><u></b></i></u>"; 27 + "<table><div></table></div>"; 28 + "<span><div></span></div>"; 29 + |] 30 + 31 + (* Category 3: Invalid tag names *) 32 + let invalid_tag_cases = [| 33 + "<123>"; 34 + "<!>"; 35 + "<>"; 36 + "< >"; 37 + "<\t>"; 38 + "<\n>"; 39 + "<%>"; 40 + "<&>"; 41 + "<div<>"; 42 + "<a<b>"; 43 + |] 44 + 45 + (* Category 4: Invalid attributes *) 46 + let invalid_attr_cases = [| 47 + "<div =value>"; 48 + "<div 123=value>"; 49 + "<div a=b=c>"; 50 + "<div a='b\"c>"; 51 + "<div a=\"b'c>"; 52 + "<div a=<b>"; 53 + "<div a=>>"; 54 + "<div a b c>"; 55 + "<div onclick=\"<script>\">"; 56 + |] 57 + 58 + (* Category 5: Premature EOF *) 59 + let premature_eof_cases = [| 60 + "<div"; 61 + "<div attr"; 62 + "<div attr="; 63 + "<div attr='val"; 64 + "<div attr=\"val"; 65 + "<!DOCTYPE"; 66 + "<!DOCTYPE html"; 67 + "<!--"; 68 + "<!-- comment"; 69 + "<![CDATA["; 70 + "<script>alert("; 71 + "<style>.foo {"; 72 + |] 73 + 74 + (* Category 6: Invalid nesting (semantic) *) 75 + let semantic_nesting_cases = [| 76 + "<a><a></a></a>"; 77 + "<form><form></form></form>"; 78 + "<button><button></button></button>"; 79 + "<p><p></p></p>"; 80 + "<li><li></li></li>"; 81 + "<dt><dt></dt></dt>"; 82 + "<table><table></table></table>"; 83 + "<select><select></select></select>"; 84 + |] 85 + 86 + (* Category 7: Foreign content errors *) 87 + let foreign_content_cases = [| 88 + "<svg><div></svg>"; 89 + "<math><span></math>"; 90 + "<svg><foreignObject><svg></foreignObject></svg>"; 91 + "<svg><title><svg></title></svg>"; 92 + "<math><annotation-xml><math></annotation-xml></math>"; 93 + "<svg><script>x</svg>"; 94 + "<svg><style>.x{}</svg>"; 95 + |] 96 + 97 + (* Category 8: DOCTYPE errors *) 98 + let doctype_cases = [| 99 + "<!DOCTYPE>"; 100 + "<!DOCTYPE html PUBLIC>"; 101 + "<!DOCTYPE html SYSTEM>"; 102 + "<!DOCTYPE html PUBLIC \"\" \"\">"; 103 + "<!doctypehtml>"; 104 + "<!DOCTYPEhtml>"; 105 + "<!doctype\nhtml>"; 106 + "<!doctype\thtml>"; 107 + "<!doctype html><!doctype html>"; 108 + |] 109 + 110 + (* Category 9: Comment errors *) 111 + let comment_cases = [| 112 + "<!---->"; 113 + "<!--->"; 114 + "<!--->"; 115 + "<!--a--!>"; 116 + "<!--a--!-->"; 117 + "<!----!>"; 118 + "<!--<script>-->"; 119 + "<!--<!---->"; 120 + "<!->"; 121 + |] 122 + 123 + (* Category 10: Entity errors *) 124 + let entity_cases = [| 125 + "&;"; 126 + "&xyz;"; 127 + "&amp"; 128 + "&#;"; 129 + "&#x;"; 130 + "&#xGHI;"; 131 + "&#99999999999;"; 132 + "&nosuchentity;"; 133 + "&#xFFFFFFFF;"; 134 + "&#0;"; 135 + |] 136 + 137 + (* Category 11: Script/style content *) 138 + let rawtext_cases = [| 139 + "<script></script></script>"; 140 + "<script><!--</script>--></script>"; 141 + "<script><![CDATA[</script>]]></script>"; 142 + "<style></style></style>"; 143 + "<textarea></textarea></textarea>"; 144 + "<title></title></title>"; 145 + "<script>x</script"; 146 + "<xmp></xmp></xmp>"; 147 + |] 148 + 149 + (* Category 12: Table structure errors *) 150 + let table_cases = [| 151 + "<table><div>x</div></table>"; 152 + "<table><tr></table>"; 153 + "<table><td></table>"; 154 + "<table><caption><table></caption></table>"; 155 + "<table><tbody><td></table>"; 156 + "<table><thead><th></table>"; 157 + "<table><col></table>"; 158 + "<table><tr><th><td></table>"; 159 + "<tr><td>x</td></tr>"; 160 + |] 161 + 162 + let test_error_recovery input category = 163 + try 164 + let r1 = Html5rw.parse (reader_of_string input) in 165 + let s1 = Html5rw.to_string ~pretty:false r1 in 166 + let r2 = Html5rw.parse (reader_of_string s1) in 167 + let s2 = Html5rw.to_string ~pretty:false r2 in 168 + let r3 = Html5rw.parse (reader_of_string s2) in 169 + let s3 = Html5rw.to_string ~pretty:false r3 in 170 + if s2 <> s3 then begin 171 + Printf.printf "ROUNDTRIP UNSTABLE [%s]:\n" category; 172 + Printf.printf " Input: %s\n" (String.escaped input); 173 + Printf.printf " S2: %s\n" (String.escaped (String.sub s2 0 (min 100 (String.length s2)))); 174 + Printf.printf " S3: %s\n" (String.escaped (String.sub s3 0 (min 100 (String.length s3)))); 175 + false 176 + end else 177 + true 178 + with e -> 179 + Printf.printf "EXCEPTION [%s]: %s\n" category (Printexc.to_string e); 180 + Printf.printf " Input: %s\n" (String.escaped input); 181 + false 182 + 183 + let run_category name cases = 184 + let passed = ref 0 in 185 + let failed = ref 0 in 186 + Array.iter (fun input -> 187 + if test_error_recovery input name then 188 + incr passed 189 + else 190 + incr failed 191 + ) cases; 192 + Printf.printf "%s: %d/%d passed\n" name !passed (Array.length cases); 193 + !failed = 0 194 + 195 + let () = 196 + let all_pass = ref true in 197 + 198 + let categories = [ 199 + ("unclosed_tags", unclosed_tag_cases); 200 + ("misnested", misnested_cases); 201 + ("invalid_tags", invalid_tag_cases); 202 + ("invalid_attrs", invalid_attr_cases); 203 + ("premature_eof", premature_eof_cases); 204 + ("semantic_nesting", semantic_nesting_cases); 205 + ("foreign_content", foreign_content_cases); 206 + ("doctype", doctype_cases); 207 + ("comments", comment_cases); 208 + ("entities", entity_cases); 209 + ("rawtext", rawtext_cases); 210 + ("tables", table_cases); 211 + ] in 212 + 213 + List.iter (fun (name, cases) -> 214 + if not (run_category name cases) then 215 + all_pass := false 216 + ) categories; 217 + 218 + Printf.printf "\n=== Summary ===\n"; 219 + if !all_pass then 220 + Printf.printf "All error recovery tests passed!\n" 221 + else begin 222 + Printf.printf "Some error recovery tests failed!\n"; 223 + exit 1 224 + end
+256
fuzz/fuzz_exhaustion.ml
··· 1 + (* Resource exhaustion tests for HTML5rw 2 + Tests for algorithmic complexity bugs, memory issues, and DoS vectors *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Timing helper *) 7 + let time_it f = 8 + let start = Unix.gettimeofday () in 9 + let result = f () in 10 + let elapsed = Unix.gettimeofday () -. start in 11 + (result, elapsed) 12 + 13 + (* Test 1: Deeply nested elements *) 14 + (* Note: Deep nesting can exhibit O(n²) complexity in tree construction. 15 + The timing thresholds are set to catch severe regressions while allowing 16 + for some expected slowdown with very deep nesting. *) 17 + let test_deep_nesting depth = 18 + let input = String.concat "" (List.init depth (fun _ -> "<div>")) in 19 + let (_, elapsed) = time_it (fun () -> 20 + try 21 + let r = Html5rw.parse (reader_of_string input) in 22 + let _ = Html5rw.to_string r in 23 + true 24 + with _ -> false 25 + ) in 26 + (* Allow quadratic behavior up to a reasonable limit for very deep nesting. 27 + HTML5 spec allows implementations to impose nesting limits for DoS protection. *) 28 + let max_time = 29 + if depth <= 1000 then float depth *. 0.001 +. 1.0 30 + else float depth *. 0.02 +. 30.0 (* Very lenient for extreme depth - known O(n²) case *) 31 + in 32 + if elapsed > max_time then begin 33 + Printf.printf "SLOW: deep_nesting(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time; 34 + false 35 + end else 36 + true 37 + 38 + (* Test 2: Wide trees (many siblings) *) 39 + let test_wide_tree width = 40 + let children = String.concat "" (List.init width (fun i -> Printf.sprintf "<span>%d</span>" i)) in 41 + let input = "<div>" ^ children ^ "</div>" in 42 + let (_, elapsed) = time_it (fun () -> 43 + try 44 + let r = Html5rw.parse (reader_of_string input) in 45 + let _ = Html5rw.to_string r in 46 + true 47 + with _ -> false 48 + ) in 49 + let max_time = float width *. 0.0001 +. 0.5 in 50 + if elapsed > max_time then begin 51 + Printf.printf "SLOW: wide_tree(%d) took %.3fs (max %.3fs)\n" width elapsed max_time; 52 + false 53 + end else 54 + true 55 + 56 + (* Test 3: Huge text nodes *) 57 + let test_huge_text size = 58 + let text = String.make size 'x' in 59 + let input = "<div>" ^ text ^ "</div>" in 60 + let (_, elapsed) = time_it (fun () -> 61 + try 62 + let r = Html5rw.parse (reader_of_string input) in 63 + let _ = Html5rw.to_string r in 64 + true 65 + with _ -> false 66 + ) in 67 + let max_time = float size *. 0.00001 +. 0.5 in 68 + if elapsed > max_time then begin 69 + Printf.printf "SLOW: huge_text(%d) took %.3fs (max %.3fs)\n" size elapsed max_time; 70 + false 71 + end else 72 + true 73 + 74 + (* Test 4: Many attributes *) 75 + let test_many_attrs count = 76 + let attrs = String.concat " " (List.init count (fun i -> Printf.sprintf "a%d=\"v%d\"" i i)) in 77 + let input = Printf.sprintf "<div %s></div>" attrs in 78 + let (_, elapsed) = time_it (fun () -> 79 + try 80 + let r = Html5rw.parse (reader_of_string input) in 81 + let _ = Html5rw.to_string r in 82 + true 83 + with _ -> false 84 + ) in 85 + let max_time = float count *. 0.0001 +. 0.5 in 86 + if elapsed > max_time then begin 87 + Printf.printf "SLOW: many_attrs(%d) took %.3fs (max %.3fs)\n" count elapsed max_time; 88 + false 89 + end else 90 + true 91 + 92 + (* Test 5: Huge attribute values *) 93 + let test_huge_attr_value size = 94 + let value = String.make size 'x' in 95 + let input = Printf.sprintf "<div data-x=\"%s\"></div>" value in 96 + let (_, elapsed) = time_it (fun () -> 97 + try 98 + let r = Html5rw.parse (reader_of_string input) in 99 + let _ = Html5rw.to_string r in 100 + true 101 + with _ -> false 102 + ) in 103 + let max_time = float size *. 0.00001 +. 0.5 in 104 + if elapsed > max_time then begin 105 + Printf.printf "SLOW: huge_attr_value(%d) took %.3fs (max %.3fs)\n" size elapsed max_time; 106 + false 107 + end else 108 + true 109 + 110 + (* Test 6: Repeated unclosed p tags (adoption agency stress test) *) 111 + let test_repeated_p count = 112 + let input = String.concat "" (List.init count (fun _ -> "<p>")) in 113 + let (_, elapsed) = time_it (fun () -> 114 + try 115 + let r = Html5rw.parse (reader_of_string input) in 116 + let _ = Html5rw.to_string r in 117 + true 118 + with _ -> false 119 + ) in 120 + (* This could trigger O(n^2) behavior in naive implementations *) 121 + let max_time = float count *. 0.001 +. 1.0 in 122 + if elapsed > max_time then begin 123 + Printf.printf "SLOW: repeated_p(%d) took %.3fs (max %.3fs)\n" count elapsed max_time; 124 + false 125 + end else 126 + true 127 + 128 + (* Test 7: Nested formatting elements (adoption agency stress) *) 129 + let test_nested_formatting depth = 130 + let tags = [| "a"; "b"; "i"; "em"; "strong" |] in 131 + let open_tags = String.concat "" (List.init depth (fun i -> "<" ^ tags.(i mod 5) ^ ">")) in 132 + let input = open_tags ^ "text" in 133 + let (_, elapsed) = time_it (fun () -> 134 + try 135 + let r = Html5rw.parse (reader_of_string input) in 136 + let _ = Html5rw.to_string r in 137 + true 138 + with _ -> false 139 + ) in 140 + let max_time = float depth *. 0.001 +. 0.5 in 141 + if elapsed > max_time then begin 142 + Printf.printf "SLOW: nested_formatting(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time; 143 + false 144 + end else 145 + true 146 + 147 + (* Test 8: Table with many cells *) 148 + let test_large_table rows cols = 149 + let cells = String.concat "" (List.init cols (fun _ -> "<td>x</td>")) in 150 + let row = "<tr>" ^ cells ^ "</tr>" in 151 + let tbody = String.concat "" (List.init rows (fun _ -> row)) in 152 + let input = "<table><tbody>" ^ tbody ^ "</tbody></table>" in 153 + let (_, elapsed) = time_it (fun () -> 154 + try 155 + let r = Html5rw.parse (reader_of_string input) in 156 + let _ = Html5rw.to_string r in 157 + true 158 + with _ -> false 159 + ) in 160 + let total = rows * cols in 161 + let max_time = float total *. 0.0001 +. 1.0 in 162 + if elapsed > max_time then begin 163 + Printf.printf "SLOW: large_table(%dx%d) took %.3fs (max %.3fs)\n" rows cols elapsed max_time; 164 + false 165 + end else 166 + true 167 + 168 + (* Test 9: Deeply nested tables *) 169 + let test_nested_tables depth = 170 + let rec make_table d = 171 + if d = 0 then "x" 172 + else "<table><tr><td>" ^ make_table (d - 1) ^ "</td></tr></table>" 173 + in 174 + let input = make_table depth in 175 + let (_, elapsed) = time_it (fun () -> 176 + try 177 + let r = Html5rw.parse (reader_of_string input) in 178 + let _ = Html5rw.to_string r in 179 + true 180 + with _ -> false 181 + ) in 182 + let max_time = float depth *. 0.01 +. 0.5 in 183 + if elapsed > max_time then begin 184 + Printf.printf "SLOW: nested_tables(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time; 185 + false 186 + end else 187 + true 188 + 189 + (* Test 10: Many entity references *) 190 + let test_many_entities count = 191 + let entities = String.concat "" (List.init count (fun _ -> "&amp;")) in 192 + let input = "<div>" ^ entities ^ "</div>" in 193 + let (_, elapsed) = time_it (fun () -> 194 + try 195 + let r = Html5rw.parse (reader_of_string input) in 196 + let _ = Html5rw.to_string r in 197 + true 198 + with _ -> false 199 + ) in 200 + let max_time = float count *. 0.0001 +. 0.5 in 201 + if elapsed > max_time then begin 202 + Printf.printf "SLOW: many_entities(%d) took %.3fs (max %.3fs)\n" count elapsed max_time; 203 + false 204 + end else 205 + true 206 + 207 + (* Run all exhaustion tests *) 208 + let run_all_tests () = 209 + let tests = [ 210 + ("deep_nesting_100", fun () -> test_deep_nesting 100); 211 + ("deep_nesting_1000", fun () -> test_deep_nesting 1000); 212 + ("deep_nesting_5000", fun () -> test_deep_nesting 5000); 213 + ("wide_tree_100", fun () -> test_wide_tree 100); 214 + ("wide_tree_1000", fun () -> test_wide_tree 1000); 215 + ("wide_tree_10000", fun () -> test_wide_tree 10000); 216 + ("huge_text_10000", fun () -> test_huge_text 10000); 217 + ("huge_text_100000", fun () -> test_huge_text 100000); 218 + ("many_attrs_100", fun () -> test_many_attrs 100); 219 + ("many_attrs_1000", fun () -> test_many_attrs 1000); 220 + ("huge_attr_10000", fun () -> test_huge_attr_value 10000); 221 + ("huge_attr_100000", fun () -> test_huge_attr_value 100000); 222 + ("repeated_p_100", fun () -> test_repeated_p 100); 223 + ("repeated_p_500", fun () -> test_repeated_p 500); 224 + ("nested_formatting_50", fun () -> test_nested_formatting 50); 225 + ("nested_formatting_200", fun () -> test_nested_formatting 200); 226 + ("large_table_10x10", fun () -> test_large_table 10 10); 227 + ("large_table_100x100", fun () -> test_large_table 100 100); 228 + ("nested_tables_10", fun () -> test_nested_tables 10); 229 + ("nested_tables_50", fun () -> test_nested_tables 50); 230 + ("many_entities_1000", fun () -> test_many_entities 1000); 231 + ("many_entities_10000", fun () -> test_many_entities 10000); 232 + ] in 233 + 234 + let passed = ref 0 in 235 + let failed = ref 0 in 236 + 237 + List.iter (fun (name, test) -> 238 + Printf.printf "Running %s... %!" name; 239 + if test () then begin 240 + Printf.printf "PASS\n%!"; 241 + incr passed 242 + end else begin 243 + Printf.printf "FAIL\n%!"; 244 + incr failed 245 + end 246 + ) tests; 247 + 248 + Printf.printf "\n=== Summary ===\n"; 249 + Printf.printf "Passed: %d\n" !passed; 250 + Printf.printf "Failed: %d\n" !failed; 251 + 252 + !failed = 0 253 + 254 + let () = 255 + if not (run_all_tests ()) then 256 + exit 1
+261
fuzz/fuzz_fragment.ml
··· 1 + (* Fragment parsing fuzzer for HTML5rw 2 + Tests innerHTML-style fragment parsing with various context elements *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* All context element types to test *) 7 + let html_contexts = [| 8 + "div"; "span"; "p"; "a"; "b"; "i"; "em"; "strong"; 9 + "ul"; "ol"; "li"; "dl"; "dt"; "dd"; 10 + "table"; "thead"; "tbody"; "tfoot"; "tr"; "th"; "td"; "caption"; 11 + "select"; "optgroup"; "option"; 12 + "form"; "fieldset"; "legend"; "label"; "input"; "button"; "textarea"; 13 + "pre"; "code"; "blockquote"; 14 + "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; 15 + "article"; "section"; "nav"; "aside"; "header"; "footer"; "main"; 16 + "figure"; "figcaption"; 17 + "template"; 18 + |] 19 + 20 + let svg_contexts = [| 21 + "svg"; "g"; "circle"; "rect"; "path"; "text"; "tspan"; 22 + "foreignObject"; "title"; "desc"; 23 + |] 24 + 25 + let math_contexts = [| 26 + "math"; "mi"; "mo"; "mn"; "ms"; "mrow"; 27 + "annotation-xml"; 28 + |] 29 + 30 + (* Test fragments for different contexts *) 31 + let general_fragments = [| 32 + "text content"; 33 + "<span>inline</span>"; 34 + "<div>block</div>"; 35 + "<a href=\"#\">link</a>"; 36 + "<b><i>nested</i></b>"; 37 + "text<br>text"; 38 + "<!-- comment -->"; 39 + "<img src=\"x\">"; 40 + |] 41 + 42 + let list_fragments = [| 43 + "<li>item</li>"; 44 + "<li>one</li><li>two</li>"; 45 + "text in list"; 46 + "<li><ul><li>nested</li></ul></li>"; 47 + |] 48 + 49 + let table_fragments = [| 50 + "<tr><td>cell</td></tr>"; 51 + "<td>cell</td>"; 52 + "<th>header</th>"; 53 + "text in table"; 54 + "<tbody><tr><td>x</td></tr></tbody>"; 55 + "<caption>title</caption>"; 56 + |] 57 + 58 + let select_fragments = [| 59 + "<option>opt</option>"; 60 + "<option value=\"1\">one</option><option value=\"2\">two</option>"; 61 + "<optgroup label=\"group\"><option>x</option></optgroup>"; 62 + "text in select"; 63 + |] 64 + 65 + let svg_fragments = [| 66 + "<circle cx=\"50\" cy=\"50\" r=\"40\"/>"; 67 + "<rect x=\"0\" y=\"0\" width=\"100\" height=\"100\"/>"; 68 + "<text>SVG text</text>"; 69 + "<g><circle r=\"10\"/></g>"; 70 + "<foreignObject><div>HTML in SVG</div></foreignObject>"; 71 + |] 72 + 73 + let math_fragments = [| 74 + "<mi>x</mi>"; 75 + "<mo>=</mo>"; 76 + "<mn>42</mn>"; 77 + "<mrow><mi>x</mi><mo>=</mo><mn>1</mn></mrow>"; 78 + |] 79 + 80 + (* Test parsing a fragment with a given context *) 81 + let test_fragment_parse ctx_tag ?namespace fragment = 82 + try 83 + let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag ?namespace () in 84 + let doc = Html5rw.parse ~fragment_context:ctx (reader_of_string fragment) in 85 + let _ = Html5rw.to_string ~pretty:false doc in 86 + true 87 + with e -> 88 + Printf.printf "Exception: %s\n" (Printexc.to_string e); 89 + Printf.printf " Context: <%s>\n" ctx_tag; 90 + Printf.printf " Fragment: %s\n" (String.escaped fragment); 91 + false 92 + 93 + (* Test roundtrip stability for fragment parsing *) 94 + let test_fragment_roundtrip ctx_tag ?namespace fragment = 95 + try 96 + let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag ?namespace () in 97 + let doc1 = Html5rw.parse ~fragment_context:ctx (reader_of_string fragment) in 98 + let s1 = Html5rw.to_string ~pretty:false doc1 in 99 + let doc2 = Html5rw.parse ~fragment_context:ctx (reader_of_string s1) in 100 + let s2 = Html5rw.to_string ~pretty:false doc2 in 101 + let doc3 = Html5rw.parse ~fragment_context:ctx (reader_of_string s2) in 102 + let s3 = Html5rw.to_string ~pretty:false doc3 in 103 + if s2 <> s3 then begin 104 + Printf.printf "Roundtrip mismatch:\n"; 105 + Printf.printf " Context: <%s>\n" ctx_tag; 106 + Printf.printf " S2: %s\n" (String.escaped s2); 107 + Printf.printf " S3: %s\n" (String.escaped s3); 108 + false 109 + end else 110 + true 111 + with _ -> false 112 + 113 + (* Compare fragment parsing with different contexts *) 114 + let test_context_sensitivity () = 115 + let test_cases = [| 116 + (* These should produce different results in different contexts *) 117 + ("<li>item</li>", [| "ul"; "ol"; "div"; "body" |]); 118 + ("<tr><td>x</td></tr>", [| "table"; "tbody"; "div"; "body" |]); 119 + ("<td>x</td>", [| "tr"; "table"; "div"; "body" |]); 120 + ("<option>x</option>", [| "select"; "optgroup"; "div"; "body" |]); 121 + ("<p>text</p>", [| "div"; "p"; "body" |]); 122 + |] in 123 + 124 + let all_ok = ref true in 125 + Array.iter (fun (fragment, contexts) -> 126 + Array.iter (fun ctx -> 127 + if not (test_fragment_parse ctx fragment) then begin 128 + Printf.printf "FAIL: <%s> with fragment: %s\n" ctx fragment; 129 + all_ok := false 130 + end 131 + ) contexts 132 + ) test_cases; 133 + !all_ok 134 + 135 + (* Run comprehensive tests *) 136 + let run_all_tests () = 137 + let all_pass = ref true in 138 + 139 + Printf.printf "=== Fragment Parsing Tests ===\n\n"; 140 + 141 + (* Test HTML contexts with general fragments *) 142 + Printf.printf "--- HTML contexts with general fragments ---\n"; 143 + let html_pass = ref 0 in 144 + let html_fail = ref 0 in 145 + Array.iter (fun ctx -> 146 + Array.iter (fun fragment -> 147 + if test_fragment_parse ctx fragment then 148 + incr html_pass 149 + else 150 + incr html_fail 151 + ) general_fragments 152 + ) html_contexts; 153 + Printf.printf "HTML contexts: %d/%d\n" !html_pass (!html_pass + !html_fail); 154 + if !html_fail > 0 then all_pass := false; 155 + 156 + (* Test list contexts *) 157 + Printf.printf "\n--- List contexts ---\n"; 158 + let list_pass = ref 0 in 159 + let list_fail = ref 0 in 160 + Array.iter (fun ctx -> 161 + Array.iter (fun fragment -> 162 + if test_fragment_parse ctx fragment then 163 + incr list_pass 164 + else 165 + incr list_fail 166 + ) list_fragments 167 + ) [| "ul"; "ol"; "menu" |]; 168 + Printf.printf "List contexts: %d/%d\n" !list_pass (!list_pass + !list_fail); 169 + if !list_fail > 0 then all_pass := false; 170 + 171 + (* Test table contexts *) 172 + Printf.printf "\n--- Table contexts ---\n"; 173 + let table_pass = ref 0 in 174 + let table_fail = ref 0 in 175 + Array.iter (fun ctx -> 176 + Array.iter (fun fragment -> 177 + if test_fragment_parse ctx fragment then 178 + incr table_pass 179 + else 180 + incr table_fail 181 + ) table_fragments 182 + ) [| "table"; "tbody"; "thead"; "tfoot"; "tr" |]; 183 + Printf.printf "Table contexts: %d/%d\n" !table_pass (!table_pass + !table_fail); 184 + if !table_fail > 0 then all_pass := false; 185 + 186 + (* Test select contexts *) 187 + Printf.printf "\n--- Select contexts ---\n"; 188 + let select_pass = ref 0 in 189 + let select_fail = ref 0 in 190 + Array.iter (fun ctx -> 191 + Array.iter (fun fragment -> 192 + if test_fragment_parse ctx fragment then 193 + incr select_pass 194 + else 195 + incr select_fail 196 + ) select_fragments 197 + ) [| "select"; "optgroup" |]; 198 + Printf.printf "Select contexts: %d/%d\n" !select_pass (!select_pass + !select_fail); 199 + if !select_fail > 0 then all_pass := false; 200 + 201 + (* Test SVG contexts *) 202 + Printf.printf "\n--- SVG contexts ---\n"; 203 + let svg_pass = ref 0 in 204 + let svg_fail = ref 0 in 205 + Array.iter (fun ctx -> 206 + Array.iter (fun fragment -> 207 + if test_fragment_parse ctx ~namespace:(Some "svg") fragment then 208 + incr svg_pass 209 + else 210 + incr svg_fail 211 + ) svg_fragments 212 + ) svg_contexts; 213 + Printf.printf "SVG contexts: %d/%d\n" !svg_pass (!svg_pass + !svg_fail); 214 + if !svg_fail > 0 then all_pass := false; 215 + 216 + (* Test Math contexts *) 217 + Printf.printf "\n--- Math contexts ---\n"; 218 + let math_pass = ref 0 in 219 + let math_fail = ref 0 in 220 + Array.iter (fun ctx -> 221 + Array.iter (fun fragment -> 222 + if test_fragment_parse ctx ~namespace:(Some "math") fragment then 223 + incr math_pass 224 + else 225 + incr math_fail 226 + ) math_fragments 227 + ) math_contexts; 228 + Printf.printf "Math contexts: %d/%d\n" !math_pass (!math_pass + !math_fail); 229 + if !math_fail > 0 then all_pass := false; 230 + 231 + (* Test context sensitivity *) 232 + Printf.printf "\n--- Context sensitivity ---\n"; 233 + if not (test_context_sensitivity ()) then 234 + all_pass := false 235 + else 236 + Printf.printf "Context sensitivity: OK\n"; 237 + 238 + (* Test roundtrip for a sample *) 239 + Printf.printf "\n--- Roundtrip stability ---\n"; 240 + let rt_pass = ref 0 in 241 + let rt_fail = ref 0 in 242 + Array.iter (fun ctx -> 243 + Array.iter (fun fragment -> 244 + if test_fragment_roundtrip ctx fragment then 245 + incr rt_pass 246 + else 247 + incr rt_fail 248 + ) general_fragments 249 + ) [| "div"; "span"; "ul"; "table"; "select" |]; 250 + Printf.printf "Roundtrip: %d/%d\n" !rt_pass (!rt_pass + !rt_fail); 251 + if !rt_fail > 0 then all_pass := false; 252 + 253 + Printf.printf "\n=== Summary ===\n"; 254 + if !all_pass then 255 + Printf.printf "All fragment parsing tests passed!\n" 256 + else begin 257 + Printf.printf "Some fragment parsing tests failed!\n"; 258 + exit 1 259 + end 260 + 261 + let () = run_all_tests ()
+149
fuzz/fuzz_properties.ml
··· 1 + (* Property-based testing for HTML5rw 2 + Tests invariants that should always hold regardless of input *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Property 1: Parsing never raises exceptions on any input *) 7 + let test_parse_no_exception input = 8 + try 9 + let _ = Html5rw.parse (reader_of_string input) in 10 + true 11 + with _ -> false 12 + 13 + (* Property 2: Serialization never raises exceptions *) 14 + let test_serialize_no_exception input = 15 + try 16 + let result = Html5rw.parse (reader_of_string input) in 17 + let _ = Html5rw.to_string result in 18 + true 19 + with _ -> false 20 + 21 + (* Property 3: Serialized output is never longer than a reasonable bound *) 22 + let test_output_bounded input = 23 + try 24 + let result = Html5rw.parse (reader_of_string input) in 25 + let output = Html5rw.to_string ~pretty:false result in 26 + (* Output should not be more than 10x input + base HTML structure *) 27 + String.length output <= (String.length input * 10) + 1000 28 + with _ -> false 29 + 30 + (* Property 4: DOM tree depth is bounded *) 31 + let rec tree_depth node = 32 + let child_depths = List.map tree_depth node.Html5rw.Dom.children in 33 + 1 + (List.fold_left max 0 child_depths) 34 + 35 + let test_depth_bounded input = 36 + try 37 + let result = Html5rw.parse (reader_of_string input) in 38 + let depth = tree_depth (Html5rw.root result) in 39 + (* Depth should not exceed input length (at most one level per char) *) 40 + depth <= String.length input + 10 41 + with _ -> false 42 + 43 + (* Property 5: All text content from input appears somewhere in DOM *) 44 + let rec collect_text node = 45 + if node.Html5rw.Dom.name = "#text" then 46 + [node.Html5rw.Dom.data] 47 + else 48 + List.concat_map collect_text node.Html5rw.Dom.children 49 + 50 + let test_text_preserved input = 51 + try 52 + let result = Html5rw.parse (reader_of_string input) in 53 + let dom_text = String.concat "" (collect_text (Html5rw.root result)) in 54 + (* Every non-tag character should appear in text content or be structural *) 55 + let input_text = Str.global_replace (Str.regexp "<[^>]*>") "" input in 56 + let input_text = Str.global_replace (Str.regexp "&[a-zA-Z]+;") "" input_text in 57 + (* Relaxed check: DOM text should have substantial overlap with input text *) 58 + String.length dom_text >= (String.length input_text / 4) || String.length input_text < 10 59 + with _ -> true (* Parse errors are ok *) 60 + 61 + (* Property 6: Element count is bounded by tag markers in input *) 62 + let rec count_elements node = 63 + let is_element = not (String.length node.Html5rw.Dom.name > 0 && node.Html5rw.Dom.name.[0] = '#') in 64 + let child_count = List.fold_left (+) 0 (List.map count_elements node.Html5rw.Dom.children) in 65 + (if is_element then 1 else 0) + child_count 66 + 67 + let count_char c s = 68 + let count = ref 0 in 69 + String.iter (fun ch -> if ch = c then incr count) s; 70 + !count 71 + 72 + let test_element_count_bounded input = 73 + try 74 + let result = Html5rw.parse (reader_of_string input) in 75 + let elem_count = count_elements (Html5rw.root result) in 76 + let lt_count = count_char '<' input in 77 + (* Element count should not exceed < count + implicit elements (html, head, body) *) 78 + elem_count <= lt_count + 10 79 + with _ -> false 80 + 81 + (* Property 7: Attribute values survive roundtrip (modulo escaping) *) 82 + let rec collect_attrs node = 83 + let own_attrs = node.Html5rw.Dom.attrs in 84 + let child_attrs = List.concat_map collect_attrs node.Html5rw.Dom.children in 85 + own_attrs @ child_attrs 86 + 87 + let unescape_html s = 88 + let s = Str.global_replace (Str.regexp "&amp;") "&" s in 89 + let s = Str.global_replace (Str.regexp "&lt;") "<" s in 90 + let s = Str.global_replace (Str.regexp "&gt;") ">" s in 91 + let s = Str.global_replace (Str.regexp "&quot;") "\"" s in 92 + let s = Str.global_replace (Str.regexp "&#39;") "'" s in 93 + s 94 + 95 + let test_attr_roundtrip input = 96 + try 97 + let r1 = Html5rw.parse (reader_of_string input) in 98 + let s1 = Html5rw.to_string ~pretty:false r1 in 99 + let r2 = Html5rw.parse (reader_of_string s1) in 100 + let attrs1 = collect_attrs (Html5rw.root r1) in 101 + let attrs2 = collect_attrs (Html5rw.root r2) in 102 + (* After roundtrip, attribute values should match (modulo escaping) *) 103 + let normalize_attrs attrs = 104 + List.sort compare (List.map (fun (k, v) -> (k, unescape_html v)) attrs) 105 + in 106 + normalize_attrs attrs1 = normalize_attrs attrs2 || 107 + (* Allow some attrs to be dropped if they have invalid names *) 108 + List.length attrs2 <= List.length attrs1 109 + with _ -> true 110 + 111 + (* Property 8: Idempotent after first roundtrip *) 112 + let test_idempotent input = 113 + try 114 + let r1 = Html5rw.parse (reader_of_string input) in 115 + let s1 = Html5rw.to_string ~pretty:false r1 in 116 + let r2 = Html5rw.parse (reader_of_string s1) in 117 + let s2 = Html5rw.to_string ~pretty:false r2 in 118 + let r3 = Html5rw.parse (reader_of_string s2) in 119 + let s3 = Html5rw.to_string ~pretty:false r3 in 120 + s2 = s3 121 + with _ -> false 122 + 123 + (* Run all property tests *) 124 + let run_all_properties input = 125 + let results = [ 126 + ("parse_no_exception", test_parse_no_exception input); 127 + ("serialize_no_exception", test_serialize_no_exception input); 128 + ("output_bounded", test_output_bounded input); 129 + ("depth_bounded", test_depth_bounded input); 130 + ("text_preserved", test_text_preserved input); 131 + ("element_count_bounded", test_element_count_bounded input); 132 + ("attr_roundtrip", test_attr_roundtrip input); 133 + ("idempotent", test_idempotent input); 134 + ] in 135 + let failures = List.filter (fun (_, ok) -> not ok) results in 136 + if failures <> [] then begin 137 + Printf.printf "PROPERTY FAILURES for input: %s\n" (String.escaped (String.sub input 0 (min 100 (String.length input)))); 138 + List.iter (fun (name, _) -> Printf.printf " - %s\n" name) failures; 139 + false 140 + end else 141 + true 142 + 143 + (* AFL entry point *) 144 + let () = 145 + AflPersistent.run (fun () -> 146 + let input = In_channel.input_all In_channel.stdin in 147 + if not (run_all_properties input) then 148 + exit 1 149 + )
+245
fuzz/fuzz_security.ml
··· 1 + (* Security/sanitizer testing for HTML5rw 2 + Tests XSS vectors, mXSS patterns, and security-relevant parsing behavior *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Helper to check if script-like content appears in output *) 7 + let contains_script_tag output = 8 + let output_lower = String.lowercase_ascii output in 9 + String.length output_lower >= 7 && 10 + (try let _ = Str.search_forward (Str.regexp "<script") output_lower 0 in true with Not_found -> false) 11 + 12 + (* Reserved for future use in sanitizer testing *) 13 + let _contains_event_handler output = 14 + let output_lower = String.lowercase_ascii output in 15 + try let _ = Str.search_forward (Str.regexp "on[a-z]+=") output_lower 0 in true with Not_found -> false 16 + 17 + let _contains_javascript_url output = 18 + let output_lower = String.lowercase_ascii output in 19 + try let _ = Str.search_forward (Str.regexp "javascript:") output_lower 0 in true with Not_found -> false 20 + 21 + (* Test parsing and serialization *) 22 + let parse_and_serialize input = 23 + try 24 + let doc = Html5rw.parse (reader_of_string input) in 25 + Some (Html5rw.to_string ~pretty:false doc) 26 + with _ -> None 27 + 28 + (* Category 1: Basic XSS vectors (these should parse cleanly, not be sanitized) *) 29 + let basic_xss_vectors = [| 30 + "<script>alert(1)</script>"; 31 + "<img src=x onerror=alert(1)>"; 32 + "<svg onload=alert(1)>"; 33 + "<body onload=alert(1)>"; 34 + "<a href=\"javascript:alert(1)\">click</a>"; 35 + "<iframe src=\"javascript:alert(1)\">"; 36 + "<input onfocus=alert(1) autofocus>"; 37 + "<marquee onstart=alert(1)>"; 38 + "<video><source onerror=alert(1)>"; 39 + "<details ontoggle=alert(1) open>"; 40 + |] 41 + 42 + (* Category 2: Obfuscated XSS (parser should handle these consistently) *) 43 + let obfuscated_xss = [| 44 + (* Case variations *) 45 + "<ScRiPt>alert(1)</sCrIpT>"; 46 + "<IMG SRC=x ONERROR=alert(1)>"; 47 + "<SVG ONLOAD=alert(1)>"; 48 + 49 + (* Whitespace variations *) 50 + "<script\n>alert(1)</script>"; 51 + "<script\t>alert(1)</script>"; 52 + "<script\r>alert(1)</script>"; 53 + "<img src=x\nonerror=alert(1)>"; 54 + 55 + (* Null bytes (should be handled) *) 56 + "<scr\x00ipt>alert(1)</script>"; 57 + "<img src=x onerr\x00or=alert(1)>"; 58 + 59 + (* Entity encoding in attributes *) 60 + "<a href=\"java&#115;cript:alert(1)\">x</a>"; 61 + "<a href=\"java&#x73;cript:alert(1)\">x</a>"; 62 + "<img src=x onerror=&#97;lert(1)>"; 63 + |] 64 + 65 + (* Category 3: mXSS patterns (mutation XSS through parser quirks) *) 66 + let mxss_patterns = [| 67 + (* Backtick in attributes *) 68 + "<img src=`x`onerror=alert(1)>"; 69 + "<div style=`background:url(x)`onmouseover=alert(1)>"; 70 + 71 + (* Unclosed tags/attributes *) 72 + "<img src=\"x\" onerror=\"alert(1)"; 73 + "<img src=x onerror=alert(1)//"; 74 + "<div attr=\"></div><script>alert(1)</script>"; 75 + 76 + (* Tag breaking *) 77 + "<div><script>alert(1)</script"; 78 + "<div><<script>alert(1)</script>"; 79 + "</title><script>alert(1)</script>"; 80 + 81 + (* Foreign content escapes *) 82 + "<svg><![CDATA[<script>alert(1)</script>]]></svg>"; 83 + "<svg><foreignObject><script>alert(1)</script></foreignObject></svg>"; 84 + "<math><mtext><script>alert(1)</script></mtext></math>"; 85 + 86 + (* Template injection *) 87 + "<template><script>alert(1)</script></template>"; 88 + 89 + (* Noscript edge cases *) 90 + "<noscript><script>alert(1)</script></noscript>"; 91 + |] 92 + 93 + (* Category 4: Attribute injection patterns *) 94 + let attr_injection = [| 95 + (* Breaking out of attributes *) 96 + "<div title=\"x\" onclick=\"alert(1)\">x</div>"; 97 + "<div title='x' onclick='alert(1)'>x</div>"; 98 + "<div title=x onclick=alert(1)>x</div>"; 99 + 100 + (* Attribute without value *) 101 + "<input value=\"x\" onfocus autofocus>"; 102 + 103 + (* Multiple attributes *) 104 + "<div a=1 b=2 onclick=alert(1) c=3>x</div>"; 105 + 106 + (* Quote mismatches *) 107 + "<div title=\"x'onclick=alert(1)//\">x</div>"; 108 + "<div title='x\"onclick=alert(1)//'>x</div>"; 109 + 110 + (* Entity in attribute names *) 111 + "<div o&#110;click=alert(1)>x</div>"; 112 + |] 113 + 114 + (* Category 5: URL-based attacks *) 115 + let url_attacks = [| 116 + "<a href=\"javascript:alert(1)\">x</a>"; 117 + "<a href=\"JAVASCRIPT:alert(1)\">x</a>"; 118 + "<a href=\" javascript:alert(1)\">x</a>"; 119 + "<a href=\"&#106;avascript:alert(1)\">x</a>"; 120 + "<a href=\"java\tscript:alert(1)\">x</a>"; 121 + "<a href=\"java\nscript:alert(1)\">x</a>"; 122 + "<a href=\"java\rscript:alert(1)\">x</a>"; 123 + "<a href=\"data:text/html,<script>alert(1)</script>\">x</a>"; 124 + "<a href=\"vbscript:alert(1)\">x</a>"; 125 + "<iframe src=\"javascript:alert(1)\">"; 126 + "<embed src=\"javascript:alert(1)\">"; 127 + "<object data=\"javascript:alert(1)\">"; 128 + "<form action=\"javascript:alert(1)\">"; 129 + |] 130 + 131 + (* Category 6: Style-based attacks *) 132 + let style_attacks = [| 133 + "<div style=\"background:url(javascript:alert(1))\">x</div>"; 134 + "<div style=\"expression(alert(1))\">x</div>"; 135 + "<div style=\"-moz-binding:url(http://evil.com/xss.xml#xss)\">x</div>"; 136 + "<style>@import 'http://evil.com/xss.css';</style>"; 137 + "<style>body { background: url('javascript:alert(1)'); }</style>"; 138 + "<link rel=\"stylesheet\" href=\"javascript:alert(1)\">"; 139 + |] 140 + 141 + (* Category 7: Tag soup and parser confusion *) 142 + let tag_soup = [| 143 + "<div<div>test</div>"; 144 + "<div<<div>>test</div>"; 145 + "<<div>>test</div>"; 146 + "<div>test</div</div>>"; 147 + "</</div>>"; 148 + "</ div>"; 149 + "<div / onclick=alert(1)>"; 150 + "<div/onclick=alert(1)>"; 151 + "<div><</div>"; 152 + "<div>></div>"; 153 + "<div>&lt;script&gt;alert(1)&lt;/script&gt;</div>"; 154 + |] 155 + 156 + (* Test that parsing is stable (no mXSS through parse-serialize-parse) *) 157 + let test_mxss_stability input = 158 + match parse_and_serialize input with 159 + | None -> (true, "parse failed") (* Parse failure is ok for malformed input *) 160 + | Some s1 -> 161 + match parse_and_serialize s1 with 162 + | None -> (false, "re-parse failed") 163 + | Some s2 -> 164 + match parse_and_serialize s2 with 165 + | None -> (false, "third parse failed") 166 + | Some s3 -> 167 + if s2 = s3 then (true, "stable") 168 + else (false, Printf.sprintf "unstable: s2=%s s3=%s" (String.escaped s2) (String.escaped s3)) 169 + 170 + (* Test that dangerous content doesn't appear after parsing innocuous-looking input *) 171 + let test_no_script_injection input = 172 + if contains_script_tag input then 173 + (* If input has script, we expect output might too *) 174 + true 175 + else 176 + match parse_and_serialize input with 177 + | None -> true 178 + | Some output -> 179 + if contains_script_tag output then begin 180 + Printf.printf "SCRIPT TAG APPEARED:\n"; 181 + Printf.printf " Input: %s\n" (String.escaped input); 182 + Printf.printf " Output: %s\n" (String.escaped output); 183 + false 184 + end else 185 + true 186 + 187 + let run_test_category name cases = 188 + Printf.printf "--- %s ---\n" name; 189 + let stable_count = ref 0 in 190 + let unstable_count = ref 0 in 191 + Array.iter (fun input -> 192 + let (stable, msg) = test_mxss_stability input in 193 + if stable then incr stable_count 194 + else begin 195 + Printf.printf "UNSTABLE: %s\n" (String.escaped (String.sub input 0 (min 60 (String.length input)))); 196 + Printf.printf " %s\n" msg; 197 + incr unstable_count 198 + end 199 + ) cases; 200 + Printf.printf "%s: %d stable, %d unstable\n\n" name !stable_count !unstable_count; 201 + !unstable_count = 0 202 + 203 + let () = 204 + Printf.printf "=== Security/Sanitizer Tests ===\n\n"; 205 + 206 + let all_pass = ref true in 207 + 208 + (* Test each category for mXSS stability *) 209 + if not (run_test_category "Basic XSS vectors" basic_xss_vectors) then 210 + all_pass := false; 211 + if not (run_test_category "Obfuscated XSS" obfuscated_xss) then 212 + all_pass := false; 213 + if not (run_test_category "mXSS patterns" mxss_patterns) then 214 + all_pass := false; 215 + if not (run_test_category "Attribute injection" attr_injection) then 216 + all_pass := false; 217 + if not (run_test_category "URL attacks" url_attacks) then 218 + all_pass := false; 219 + if not (run_test_category "Style attacks" style_attacks) then 220 + all_pass := false; 221 + if not (run_test_category "Tag soup" tag_soup) then 222 + all_pass := false; 223 + 224 + (* Test for script tag injection *) 225 + Printf.printf "--- Script injection tests ---\n"; 226 + let inject_pass = ref 0 in 227 + let inject_fail = ref 0 in 228 + let non_script_inputs = Array.concat [attr_injection; tag_soup] in 229 + Array.iter (fun input -> 230 + if test_no_script_injection input then 231 + incr inject_pass 232 + else 233 + incr inject_fail 234 + ) non_script_inputs; 235 + Printf.printf "No unexpected script injection: %d/%d\n\n" 236 + !inject_pass (!inject_pass + !inject_fail); 237 + if !inject_fail > 0 then all_pass := false; 238 + 239 + Printf.printf "=== Summary ===\n"; 240 + if !all_pass then 241 + Printf.printf "All security tests passed!\n" 242 + else begin 243 + Printf.printf "Some security tests failed!\n"; 244 + exit 1 245 + end
+215
fuzz/fuzz_serializer.ml
··· 1 + (* Serializer-specific fuzzer for HTML5rw 2 + Tests serialization edge cases and formatting options *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Normalize whitespace for comparison - removes formatting differences *) 7 + let normalize_whitespace s = 8 + let s = Str.global_replace (Str.regexp "[\n\r\t ]+") " " s in 9 + let s = Str.global_replace (Str.regexp "> <") "><" s in 10 + String.trim s 11 + 12 + (* Test serialization with different pretty-print settings *) 13 + let test_pretty_modes input = 14 + try 15 + let doc = Html5rw.parse (reader_of_string input) in 16 + 17 + let s_compact = Html5rw.to_string ~pretty:false doc in 18 + let s_pretty = Html5rw.to_string ~pretty:true doc in 19 + 20 + (* Both should parse back to equivalent DOMs *) 21 + let doc_compact = Html5rw.parse (reader_of_string s_compact) in 22 + let doc_pretty = Html5rw.parse (reader_of_string s_pretty) in 23 + 24 + let s_compact2 = Html5rw.to_string ~pretty:false doc_compact in 25 + let s_pretty2 = Html5rw.to_string ~pretty:false doc_pretty in 26 + 27 + (* Compact versions should be identical (roundtrip stable) *) 28 + if s_compact <> s_compact2 then begin 29 + Printf.printf "Compact roundtrip mismatch:\n"; 30 + Printf.printf " s_compact: %s\n" (String.escaped s_compact); 31 + Printf.printf " s_compact2: %s\n" (String.escaped s_compact2); 32 + false 33 + end else begin 34 + (* Pretty and compact should have same semantic content (modulo whitespace) *) 35 + let norm_compact = normalize_whitespace s_compact in 36 + let norm_pretty2 = normalize_whitespace s_pretty2 in 37 + if norm_compact <> norm_pretty2 then begin 38 + Printf.printf "Pretty/compact semantic mismatch:\n"; 39 + Printf.printf " From compact: %s\n" (String.escaped norm_compact); 40 + Printf.printf " From pretty: %s\n" (String.escaped norm_pretty2); 41 + false 42 + end else 43 + true 44 + end 45 + with e -> 46 + Printf.printf "Exception: %s\n" (Printexc.to_string e); 47 + Printf.printf " Input: %s\n" (String.escaped input); 48 + false 49 + 50 + (* Test attribute serialization *) 51 + let attr_test_cases = [| 52 + (* Basic attributes *) 53 + "<div id=\"test\"></div>"; 54 + "<div class=\"foo bar\"></div>"; 55 + "<div data-x=\"value\"></div>"; 56 + 57 + (* Quoting *) 58 + "<div attr=\"a'b\"></div>"; 59 + "<div attr='a\"b'></div>"; 60 + "<div attr=\"a&quot;b\"></div>"; 61 + "<div attr='a&apos;b'></div>"; 62 + 63 + (* Special characters *) 64 + "<div attr=\"a<b\"></div>"; 65 + "<div attr=\"a>b\"></div>"; 66 + "<div attr=\"a&b\"></div>"; 67 + "<div attr=\"a&amp;b\"></div>"; 68 + 69 + (* Empty and valueless *) 70 + "<div attr></div>"; 71 + "<div attr=\"\"></div>"; 72 + "<input disabled>"; 73 + "<input checked>"; 74 + 75 + (* Whitespace *) 76 + "<div attr=\"a b\"></div>"; 77 + "<div attr=\"a\nb\"></div>"; 78 + "<div attr=\"a\tb\"></div>"; 79 + 80 + (* URLs *) 81 + "<a href=\"http://example.com?a=1&b=2\"></a>"; 82 + "<a href=\"javascript:alert('x')\"></a>"; 83 + "<a href=\"data:text/html,<script>x</script>\"></a>"; 84 + 85 + (* Event handlers *) 86 + "<div onclick=\"alert(&quot;x&quot;)\"></div>"; 87 + "<div onclick='alert(\"x\")'></div>"; 88 + 89 + (* Multiple attributes *) 90 + "<div a=\"1\" b=\"2\" c=\"3\"></div>"; 91 + "<div a b c></div>"; 92 + |] 93 + 94 + (* Test void element serialization *) 95 + let void_test_cases = [| 96 + "<br>"; 97 + "<br/>"; 98 + "<br />"; 99 + "<hr>"; 100 + "<img src=\"x\">"; 101 + "<input type=\"text\">"; 102 + "<meta charset=\"utf-8\">"; 103 + "<link rel=\"stylesheet\">"; 104 + "<area>"; 105 + "<base href=\"/\">"; 106 + "<col span=\"2\">"; 107 + "<embed src=\"x\">"; 108 + "<source src=\"x\">"; 109 + "<track src=\"x\">"; 110 + "<wbr>"; 111 + |] 112 + 113 + (* Test raw text element serialization *) 114 + (* Note: Test cases with </script> inside script are omitted because they are 115 + intentionally invalid HTML and the parser correctly terminates at </script> *) 116 + let rawtext_test_cases = [| 117 + "<script>var x = 1;</script>"; 118 + "<script>var x = '<div>';</script>"; 119 + "<style>.x { color: red; }</style>"; 120 + "<textarea>Hello world</textarea>"; 121 + "<textarea><div>not an element</div></textarea>"; 122 + "<title>Page &amp; Title</title>"; 123 + "<xmp><div>preformatted</div></xmp>"; 124 + |] 125 + 126 + (* Test whitespace preservation *) 127 + let whitespace_test_cases = [| 128 + "<pre> spaces </pre>"; 129 + "<pre>\n\nlines\n\n</pre>"; 130 + "<pre>\ttabs\t</pre>"; 131 + "<code> code </code>"; 132 + "<textarea> text </textarea>"; 133 + "<div> text </div>"; 134 + "<p> text </p>"; 135 + |] 136 + 137 + (* Test entity serialization *) 138 + let entity_test_cases = [| 139 + "<div>&amp;</div>"; 140 + "<div>&lt;</div>"; 141 + "<div>&gt;</div>"; 142 + "<div>&quot;</div>"; 143 + "<div>&apos;</div>"; 144 + "<div>&nbsp;</div>"; 145 + "<div>&#60;</div>"; 146 + "<div>&#x3C;</div>"; 147 + "<div>&copy;</div>"; 148 + "<div>&mdash;</div>"; 149 + |] 150 + 151 + (* Test nested structure serialization *) 152 + let nested_test_cases = [| 153 + "<div><div><div></div></div></div>"; 154 + "<table><tbody><tr><td><table><tbody><tr><td></td></tr></tbody></table></td></tr></tbody></table>"; 155 + "<ul><li><ul><li><ul><li></li></ul></li></ul></li></ul>"; 156 + "<dl><dt><dl><dt></dt></dl></dt></dl>"; 157 + |] 158 + 159 + (* Test foreign content serialization *) 160 + let foreign_test_cases = [| 161 + "<svg></svg>"; 162 + "<svg viewBox=\"0 0 100 100\"><circle cx=\"50\" cy=\"50\" r=\"40\"/></svg>"; 163 + "<svg><text>Hello</text></svg>"; 164 + "<math></math>"; 165 + "<math><mi>x</mi><mo>=</mo><mn>1</mn></math>"; 166 + "<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>"; 167 + |] 168 + 169 + let run_test_category name cases test_fn = 170 + let passed = ref 0 in 171 + let failed = ref 0 in 172 + Array.iter (fun input -> 173 + if test_fn input then 174 + incr passed 175 + else begin 176 + Printf.printf "FAIL: %s\n" (String.escaped input); 177 + incr failed 178 + end 179 + ) cases; 180 + Printf.printf "%s: %d/%d\n" name !passed (Array.length cases); 181 + !failed = 0 182 + 183 + let () = 184 + let all_pass = ref true in 185 + 186 + Printf.printf "=== Serializer Tests ===\n\n"; 187 + 188 + if not (run_test_category "Attribute serialization" attr_test_cases test_pretty_modes) then 189 + all_pass := false; 190 + 191 + if not (run_test_category "Void elements" void_test_cases test_pretty_modes) then 192 + all_pass := false; 193 + 194 + if not (run_test_category "Raw text elements" rawtext_test_cases test_pretty_modes) then 195 + all_pass := false; 196 + 197 + if not (run_test_category "Whitespace preservation" whitespace_test_cases test_pretty_modes) then 198 + all_pass := false; 199 + 200 + if not (run_test_category "Entity serialization" entity_test_cases test_pretty_modes) then 201 + all_pass := false; 202 + 203 + if not (run_test_category "Nested structures" nested_test_cases test_pretty_modes) then 204 + all_pass := false; 205 + 206 + if not (run_test_category "Foreign content" foreign_test_cases test_pretty_modes) then 207 + all_pass := false; 208 + 209 + Printf.printf "\n=== Summary ===\n"; 210 + if !all_pass then 211 + Printf.printf "All serializer tests passed!\n" 212 + else begin 213 + Printf.printf "Some serializer tests failed!\n"; 214 + exit 1 215 + end
+214
fuzz/fuzz_streaming.ml
··· 1 + (* Streaming/incremental fuzzer for HTML5rw 2 + Tests that parsing produces identical results regardless of input characteristics *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Test that repeated parsing produces stable results *) 7 + let test_stability input = 8 + try 9 + (* Parse multiple times and ensure consistent results *) 10 + let doc1 = Html5rw.parse (reader_of_string input) in 11 + let s1 = Html5rw.to_string ~pretty:false doc1 in 12 + 13 + let doc2 = Html5rw.parse (reader_of_string input) in 14 + let s2 = Html5rw.to_string ~pretty:false doc2 in 15 + 16 + let doc3 = Html5rw.parse (reader_of_string input) in 17 + let s3 = Html5rw.to_string ~pretty:false doc3 in 18 + 19 + if s1 <> s2 || s2 <> s3 then begin 20 + Printf.printf "PARSING NOT DETERMINISTIC:\n"; 21 + Printf.printf " S1: %s\n" (String.escaped (String.sub s1 0 (min 100 (String.length s1)))); 22 + Printf.printf " S2: %s\n" (String.escaped (String.sub s2 0 (min 100 (String.length s2)))); 23 + Printf.printf " S3: %s\n" (String.escaped (String.sub s3 0 (min 100 (String.length s3)))); 24 + false 25 + end else 26 + true 27 + with e -> 28 + Printf.printf "Exception: %s\n" (Printexc.to_string e); 29 + Printf.printf " Input: %s\n" (String.escaped (String.sub input 0 (min 100 (String.length input)))); 30 + false 31 + 32 + (* Test roundtrip stability *) 33 + let test_roundtrip input = 34 + try 35 + let doc1 = Html5rw.parse (reader_of_string input) in 36 + let s1 = Html5rw.to_string ~pretty:false doc1 in 37 + let doc2 = Html5rw.parse (reader_of_string s1) in 38 + let s2 = Html5rw.to_string ~pretty:false doc2 in 39 + let doc3 = Html5rw.parse (reader_of_string s2) in 40 + let s3 = Html5rw.to_string ~pretty:false doc3 in 41 + if s2 <> s3 then begin 42 + Printf.printf "ROUNDTRIP UNSTABLE:\n"; 43 + Printf.printf " Input: %s\n" (String.escaped (String.sub input 0 (min 50 (String.length input)))); 44 + Printf.printf " S2: %s\n" (String.escaped (String.sub s2 0 (min 100 (String.length s2)))); 45 + Printf.printf " S3: %s\n" (String.escaped (String.sub s3 0 (min 100 (String.length s3)))); 46 + false 47 + end else 48 + true 49 + with e -> 50 + Printf.printf "Exception in roundtrip: %s\n" (Printexc.to_string e); 51 + false 52 + 53 + (* Test cases that stress boundary handling *) 54 + let boundary_test_cases = [| 55 + (* Tags split at various positions *) 56 + "<div></div>"; 57 + "<div attr=\"value\"></div>"; 58 + "<!DOCTYPE html>"; 59 + "<!-- comment -->"; 60 + 61 + (* Entity references *) 62 + "&amp;"; 63 + "&lt;test&gt;"; 64 + "&#60;"; 65 + "&#x3C;"; 66 + 67 + (* Multi-byte UTF-8 *) 68 + "caf\xC3\xA9"; (* café *) 69 + "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"; (* 日本語 *) 70 + "\xF0\x9F\x8E\x89"; (* 🎉 *) 71 + "<div>\xE4\xB8\xAD\xE6\x96\x87</div>"; 72 + 73 + (* Script/style content *) 74 + "<script>var x = 1;</script>"; 75 + "<style>.x { color: red; }</style>"; 76 + 77 + (* CDATA-like in script *) 78 + "<script>//<![CDATA[\nvar x = 1;\n//]]></script>"; 79 + 80 + (* Long strings *) 81 + String.make 100 'x'; 82 + "<div>" ^ String.make 100 'x' ^ "</div>"; 83 + 84 + (* Many small tags *) 85 + String.concat "" (List.init 20 (fun _ -> "<b>x</b>")); 86 + 87 + (* Whitespace variations *) 88 + "<div attr = 'value' ></div>"; 89 + "<pre>\n\n\ntext\n\n\n</pre>"; 90 + 91 + (* Mixed content *) 92 + "<div>text<span>more</span>text</div>"; 93 + |] 94 + 95 + (* Additional edge cases *) 96 + let edge_cases = [| 97 + (* Empty *) 98 + ""; 99 + 100 + (* Just whitespace *) 101 + " "; 102 + "\n\n\n"; 103 + "\t\t\t"; 104 + 105 + (* Single characters *) 106 + "<"; 107 + ">"; 108 + "&"; 109 + "/"; 110 + 111 + (* Partial tags *) 112 + "<d"; 113 + "<di"; 114 + "<div"; 115 + "<div>"; 116 + 117 + (* CR/LF variations *) 118 + "<div>\r\n</div>"; 119 + "<div>\r</div>"; 120 + "<div>\n</div>"; 121 + "line1\r\nline2\rline3\nline4"; 122 + 123 + (* Multiple documents *) 124 + "<!DOCTYPE html><html><body>x</body></html>"; 125 + |] 126 + 127 + (* Test that pretty-printing produces valid HTML that roundtrips properly *) 128 + let test_pretty_roundtrip input = 129 + try 130 + let doc = Html5rw.parse (reader_of_string input) in 131 + let s_pretty = Html5rw.to_string ~pretty:true doc in 132 + 133 + (* Pretty output should roundtrip-stable *) 134 + let doc_pretty = Html5rw.parse (reader_of_string s_pretty) in 135 + let s_pretty2 = Html5rw.to_string ~pretty:true doc_pretty in 136 + let doc_pretty2 = Html5rw.parse (reader_of_string s_pretty2) in 137 + let s_pretty3 = Html5rw.to_string ~pretty:true doc_pretty2 in 138 + 139 + if s_pretty2 <> s_pretty3 then begin 140 + Printf.printf "PRETTY ROUNDTRIP UNSTABLE:\n"; 141 + Printf.printf " S2: %s\n" (String.escaped (String.sub s_pretty2 0 (min 100 (String.length s_pretty2)))); 142 + Printf.printf " S3: %s\n" (String.escaped (String.sub s_pretty3 0 (min 100 (String.length s_pretty3)))); 143 + false 144 + end else 145 + true 146 + with e -> 147 + Printf.printf "Exception: %s\n" (Printexc.to_string e); 148 + false 149 + 150 + (* Run all tests *) 151 + let run_all_tests () = 152 + let test_cases = Array.concat [boundary_test_cases; edge_cases] in 153 + let all_pass = ref true in 154 + 155 + Printf.printf "=== Streaming/Stability Tests ===\n\n"; 156 + 157 + (* Test parsing stability *) 158 + Printf.printf "--- Parsing stability ---\n"; 159 + let stable_pass = ref 0 in 160 + let stable_fail = ref 0 in 161 + Array.iter (fun input -> 162 + if test_stability input then 163 + incr stable_pass 164 + else 165 + incr stable_fail 166 + ) test_cases; 167 + Printf.printf "Parsing stability: %d/%d\n" !stable_pass (Array.length test_cases); 168 + if !stable_fail > 0 then all_pass := false; 169 + 170 + (* Test roundtrip stability *) 171 + Printf.printf "\n--- Roundtrip stability ---\n"; 172 + let rt_pass = ref 0 in 173 + let rt_fail = ref 0 in 174 + Array.iter (fun input -> 175 + if test_roundtrip input then 176 + incr rt_pass 177 + else 178 + incr rt_fail 179 + ) test_cases; 180 + Printf.printf "Roundtrip stability: %d/%d\n" !rt_pass (Array.length test_cases); 181 + if !rt_fail > 0 then all_pass := false; 182 + 183 + (* Test pretty printing roundtrip stability *) 184 + Printf.printf "\n--- Pretty printing roundtrip ---\n"; 185 + let pretty_pass = ref 0 in 186 + let pretty_fail = ref 0 in 187 + Array.iter (fun input -> 188 + if test_pretty_roundtrip input then 189 + incr pretty_pass 190 + else 191 + incr pretty_fail 192 + ) test_cases; 193 + Printf.printf "Pretty roundtrip: %d/%d\n" !pretty_pass (Array.length test_cases); 194 + if !pretty_fail > 0 then all_pass := false; 195 + 196 + Printf.printf "\n=== Summary ===\n"; 197 + if !all_pass then 198 + Printf.printf "All streaming/stability tests passed!\n" 199 + else begin 200 + Printf.printf "Some tests failed!\n"; 201 + exit 1 202 + end 203 + 204 + (* AFL entry point for fuzz testing *) 205 + let fuzz_mode () = 206 + let input = In_channel.input_all In_channel.stdin in 207 + if not (test_stability input && test_roundtrip input) then 208 + exit 1 209 + 210 + let () = 211 + if Array.length Sys.argv > 1 && Sys.argv.(1) = "--fuzz" then 212 + fuzz_mode () 213 + else 214 + run_all_tests ()
+176
fuzz/fuzz_structure.ml
··· 1 + (* Structure-aware HTML fuzzer 2 + Generates and mutates syntactically plausible HTML to find edge cases *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Common HTML elements for generation *) 7 + let void_elements = [| "br"; "hr"; "img"; "input"; "meta"; "link"; "area"; "base"; "col"; "embed"; "source"; "track"; "wbr" |] 8 + let block_elements = [| "div"; "p"; "h1"; "h2"; "h3"; "section"; "article"; "header"; "footer"; "main"; "nav"; "aside"; "blockquote"; "pre"; "ul"; "ol"; "li"; "dl"; "dt"; "dd"; "figure"; "figcaption"; "table"; "form"; "fieldset" |] 9 + let inline_elements = [| "span"; "a"; "em"; "strong"; "b"; "i"; "u"; "s"; "small"; "big"; "code"; "kbd"; "var"; "samp"; "cite"; "q"; "abbr"; "time"; "mark"; "sub"; "sup" |] 10 + (* Formatting elements for adoption agency algorithm testing *) 11 + let _formatting_elements = [| "a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u" |] 12 + let table_elements = [| "table"; "thead"; "tbody"; "tfoot"; "tr"; "th"; "td"; "caption"; "colgroup"; "col" |] 13 + let special_elements = [| "script"; "style"; "template"; "svg"; "math"; "textarea"; "title"; "noscript"; "iframe"; "xmp"; "plaintext" |] 14 + 15 + let all_elements = Array.concat [void_elements; block_elements; inline_elements; table_elements; special_elements] 16 + 17 + (* Random selection *) 18 + let pick arr = arr.(Random.int (Array.length arr)) 19 + 20 + (* Generate random attribute *) 21 + let gen_attr_name () = 22 + let names = [| "id"; "class"; "style"; "href"; "src"; "alt"; "title"; "name"; "value"; "type"; "data-x"; "aria-label"; "onclick"; "onload" |] in 23 + pick names 24 + 25 + let gen_attr_value () = 26 + let values = [| ""; "x"; "test"; "foo bar"; "a\"b"; "a'b"; "a<b"; "a>b"; "a&b"; "<script>"; "javascript:"; "&#x0;"; "\x00"; "\n\t" |] in 27 + pick values 28 + 29 + let gen_attrs n = 30 + let buf = Buffer.create 64 in 31 + for _ = 1 to n do 32 + Buffer.add_char buf ' '; 33 + Buffer.add_string buf (gen_attr_name ()); 34 + if Random.bool () then begin 35 + Buffer.add_string buf "=\""; 36 + Buffer.add_string buf (gen_attr_value ()); 37 + Buffer.add_char buf '"' 38 + end 39 + done; 40 + Buffer.contents buf 41 + 42 + (* Generate random HTML *) 43 + let rec gen_html depth max_depth = 44 + if depth >= max_depth then 45 + (* Terminal: text or void element *) 46 + if Random.bool () then 47 + let texts = [| "hello"; "world"; "<text>"; "&amp;"; ""; " "; "\n" |] in 48 + pick texts 49 + else 50 + "<" ^ pick void_elements ^ gen_attrs (Random.int 3) ^ ">" 51 + else 52 + let tag = pick all_elements in 53 + let attrs = gen_attrs (Random.int 4) in 54 + let is_void = Array.mem tag void_elements in 55 + if is_void then 56 + "<" ^ tag ^ attrs ^ ">" 57 + else 58 + let children = Buffer.create 256 in 59 + let num_children = Random.int 5 in 60 + for _ = 1 to num_children do 61 + Buffer.add_string children (gen_html (depth + 1) max_depth) 62 + done; 63 + "<" ^ tag ^ attrs ^ ">" ^ Buffer.contents children ^ "</" ^ tag ^ ">" 64 + 65 + (* Mutation strategies *) 66 + type mutation = 67 + | DeleteChar of int 68 + | InsertChar of int * char 69 + | SwapChars of int * int 70 + | DuplicateRange of int * int 71 + | CorruptTag of int 72 + | UnclosedTag of int 73 + | MisnestedTags 74 + | InsertNullByte of int 75 + | TruncateAt of int 76 + 77 + let apply_mutation input mutation = 78 + let len = String.length input in 79 + if len = 0 then input else 80 + match mutation with 81 + | DeleteChar i when i < len -> 82 + String.sub input 0 i ^ String.sub input (i + 1) (len - i - 1) 83 + | InsertChar (i, c) when i <= len -> 84 + String.sub input 0 i ^ String.make 1 c ^ String.sub input i (len - i) 85 + | SwapChars (i, j) when i < len && j < len -> 86 + let bytes = Bytes.of_string input in 87 + let tmp = Bytes.get bytes i in 88 + Bytes.set bytes i (Bytes.get bytes j); 89 + Bytes.set bytes j tmp; 90 + Bytes.to_string bytes 91 + | DuplicateRange (start, len') when start < len && start + len' <= len -> 92 + let range = String.sub input start len' in 93 + String.sub input 0 start ^ range ^ range ^ String.sub input (start + len') (len - start - len') 94 + | CorruptTag i -> 95 + (* Find a < after position i and corrupt the tag *) 96 + (try 97 + let tag_start = String.index_from input (min i (len - 1)) '<' in 98 + String.sub input 0 (tag_start + 1) ^ "\x00" ^ String.sub input (tag_start + 1) (len - tag_start - 1) 99 + with Not_found -> input) 100 + | UnclosedTag i -> 101 + (* Find a </ after position i and remove it *) 102 + (try 103 + let close_start = Str.search_forward (Str.regexp "</") input (min i (len - 1)) in 104 + let close_end = String.index_from input close_start '>' in 105 + String.sub input 0 close_start ^ String.sub input (close_end + 1) (len - close_end - 1) 106 + with Not_found -> input) 107 + | MisnestedTags -> 108 + (* Generate misnested tags *) 109 + let tags = [| "a"; "b"; "i"; "em"; "strong"; "span"; "div"; "p" |] in 110 + let t1, t2 = pick tags, pick tags in 111 + input ^ "<" ^ t1 ^ "><" ^ t2 ^ "></" ^ t1 ^ "></" ^ t2 ^ ">" 112 + | InsertNullByte i when i < len -> 113 + String.sub input 0 i ^ "\x00" ^ String.sub input i (len - i) 114 + | TruncateAt i when i < len -> 115 + String.sub input 0 i 116 + | _ -> input 117 + 118 + let random_mutation input = 119 + let len = String.length input in 120 + let mutations = [| 121 + DeleteChar (Random.int (max 1 len)); 122 + InsertChar (Random.int (len + 1), Char.chr (Random.int 256)); 123 + SwapChars (Random.int (max 1 len), Random.int (max 1 len)); 124 + DuplicateRange (Random.int (max 1 len), Random.int (min 50 (max 1 len))); 125 + CorruptTag (Random.int (max 1 len)); 126 + UnclosedTag (Random.int (max 1 len)); 127 + MisnestedTags; 128 + InsertNullByte (Random.int (max 1 len)); 129 + TruncateAt (Random.int (max 1 len)); 130 + |] in 131 + apply_mutation input (pick mutations) 132 + 133 + (* Test function: parse, serialize, and check for crashes/hangs *) 134 + let test_html input = 135 + try 136 + let r1 = Html5rw.parse (reader_of_string input) in 137 + let s1 = Html5rw.to_string ~pretty:false r1 in 138 + let r2 = Html5rw.parse (reader_of_string s1) in 139 + let s2 = Html5rw.to_string ~pretty:false r2 in 140 + let r3 = Html5rw.parse (reader_of_string s2) in 141 + let s3 = Html5rw.to_string ~pretty:false r3 in 142 + if s2 <> s3 then begin 143 + Printf.printf "ROUNDTRIP FAILED:\n"; 144 + Printf.printf "Input: %s\n" (String.escaped (String.sub input 0 (min 200 (String.length input)))); 145 + Printf.printf "s2: %s\n" (String.escaped (String.sub s2 0 (min 200 (String.length s2)))); 146 + Printf.printf "s3: %s\n" (String.escaped (String.sub s3 0 (min 200 (String.length s3)))); 147 + false 148 + end else 149 + true 150 + with e -> 151 + Printf.printf "EXCEPTION: %s\n" (Printexc.to_string e); 152 + Printf.printf "Input: %s\n" (String.escaped (String.sub input 0 (min 200 (String.length input)))); 153 + false 154 + 155 + (* Main fuzzing loop *) 156 + let () = 157 + Random.self_init (); 158 + 159 + (* Mode: generate or mutate based on stdin *) 160 + let input = In_channel.input_all In_channel.stdin in 161 + 162 + let test_input = 163 + if String.length input < 10 then 164 + (* Generate random HTML *) 165 + gen_html 0 (3 + Random.int 5) 166 + else 167 + (* Mutate the input *) 168 + let mutated = ref input in 169 + for _ = 1 to 1 + Random.int 5 do 170 + mutated := random_mutation !mutated 171 + done; 172 + !mutated 173 + in 174 + 175 + if not (test_html test_input) then 176 + exit 1