OCaml HTML5 parser/serialiser based on Python's JustHTML

Compare changes

Choose any two refs to compare.

+4167 -189
-9
.claude/settings.local.json
··· 1 - { 2 - "permissions": { 3 - "allow": [ 4 - "Bash(git checkout:*)", 5 - "Bash(git add:*)", 6 - "Bash(git commit:*)" 7 - ] 8 - } 9 - }
···
+659
OCAML-FUZZING.md
···
··· 1 + # OCaml Fuzzing with Crowbar 2 + 3 + This guide covers setting up property-based fuzz testing for OCaml libraries using [Crowbar](https://github.com/stedolan/crowbar). Crowbar supports two modes: 4 + 5 + 1. **QuickCheck mode** - Fast, deterministic testing without AFL (good for CI) 6 + 2. **AFL mode** - Coverage-guided fuzzing with American Fuzzy Lop 7 + 8 + ## Quick Start 9 + 10 + ### 1. Add Dependencies 11 + 12 + Add `crowbar` to your opam dependencies: 13 + 14 + ```bash 15 + opam install crowbar 16 + ``` 17 + 18 + ### 2. Create Fuzz Test Directory 19 + 20 + Create a `fuzz/` directory at the root of your project: 21 + 22 + ``` 23 + project/ 24 + โ”œโ”€โ”€ lib/ 25 + โ”œโ”€โ”€ fuzz/ 26 + โ”‚ โ”œโ”€โ”€ dune 27 + โ”‚ โ””โ”€โ”€ fuzz_mylib.ml 28 + โ””โ”€โ”€ dune-project 29 + ``` 30 + 31 + ### 3. Add Dune Configuration 32 + 33 + Create `fuzz/dune`: 34 + 35 + ```lisp 36 + (executable 37 + (name fuzz_mylib) 38 + (libraries mylib crowbar)) 39 + ``` 40 + 41 + For multiple fuzz targets: 42 + 43 + ```lisp 44 + (executable 45 + (name fuzz_parser) 46 + (libraries mylib crowbar)) 47 + 48 + (executable 49 + (name fuzz_encoder) 50 + (libraries mylib crowbar)) 51 + ``` 52 + 53 + ### 4. Write Fuzz Tests 54 + 55 + See [Test Structure](#test-structure) below for patterns and examples. 56 + 57 + ### 5. Run Tests 58 + 59 + **QuickCheck mode** (no AFL required): 60 + ```bash 61 + dune exec ./fuzz/fuzz_mylib.exe 62 + ``` 63 + 64 + **AFL mode** (requires AFL instrumentation): 65 + ```bash 66 + # Build with AFL instrumentation 67 + dune build --context afl 68 + 69 + # Run AFL fuzzer 70 + mkdir -p input_corpus output 71 + echo "seed" > input_corpus/seed 72 + afl-fuzz -i input_corpus -o output -- _build/afl/fuzz/fuzz_mylib.exe @@ 73 + ``` 74 + 75 + ## Test Structure 76 + 77 + ### Basic Test Pattern 78 + 79 + ```ocaml 80 + open Crowbar 81 + 82 + (* Property: operation shouldn't crash on arbitrary input *) 83 + let () = 84 + add_test ~name:"mylib_no_crash" [ bytes ] @@ fun input -> 85 + match MyLib.parse input with 86 + | Ok _ | Error _ -> check true 87 + 88 + (* Property: roundtrip encode/decode *) 89 + let () = 90 + add_test ~name:"mylib_roundtrip" [ int ] @@ fun n -> 91 + let encoded = MyLib.encode n in 92 + match MyLib.decode encoded with 93 + | Error e -> failwith ("roundtrip failed: " ^ e) 94 + | Ok decoded -> check (decoded = n) 95 + ``` 96 + 97 + ### Crowbar Generators 98 + 99 + Crowbar provides generators for creating test inputs: 100 + 101 + ```ocaml 102 + open Crowbar 103 + 104 + (* Primitive generators *) 105 + let _ = bool (* bool *) 106 + let _ = int (* int (small range in quickcheck mode) *) 107 + let _ = int8 (* 0-255 *) 108 + let _ = int32 (* int32 *) 109 + let _ = int64 (* int64 *) 110 + let _ = float (* float *) 111 + let _ = bytes (* arbitrary byte string *) 112 + let _ = bytes_fixed 10 (* exactly 10 bytes *) 113 + 114 + (* Range generator *) 115 + let _ = range 100 (* 0 to 99 *) 116 + 117 + (* Choice from constants *) 118 + let _ = choose [ 119 + const "hello"; 120 + const "world"; 121 + const ""; 122 + ] 123 + 124 + (* Mapping over generators *) 125 + let positive_int = map [int] (fun n -> abs n) 126 + 127 + (* Optional values *) 128 + let maybe_int = option int (* int option *) 129 + 130 + (* Lists *) 131 + let int_list = list int (* int list *) 132 + 133 + (* Pairs and tuples *) 134 + let pair = pair int bytes (* int * bytes *) 135 + ``` 136 + 137 + ### Common Test Patterns 138 + 139 + #### 1. Crash Resistance (Robustness) 140 + 141 + Test that arbitrary input doesn't cause crashes: 142 + 143 + ```ocaml 144 + let () = 145 + add_test ~name:"parser_no_crash" [ bytes ] @@ fun input -> 146 + (* Should never raise an exception *) 147 + let _ = try Parser.parse input with _ -> [] in 148 + check true 149 + ``` 150 + 151 + #### 2. Roundtrip Property 152 + 153 + Test encode/decode symmetry: 154 + 155 + ```ocaml 156 + let () = 157 + add_test ~name:"codec_roundtrip" [ int ] @@ fun value -> 158 + match encode value with 159 + | Error e -> failwith ("encode failed: " ^ e) 160 + | Ok encoded -> 161 + match decode encoded with 162 + | Error e -> failwith ("decode failed: " ^ e) 163 + | Ok decoded -> check (decoded = value) 164 + ``` 165 + 166 + #### 3. Idempotence 167 + 168 + Test that applying operation twice equals applying once: 169 + 170 + ```ocaml 171 + let () = 172 + add_test ~name:"normalize_idempotent" [ bytes ] @@ fun input -> 173 + let once = normalize input in 174 + let twice = normalize once in 175 + check (once = twice) 176 + ``` 177 + 178 + #### 4. Known Edge Cases 179 + 180 + Test specific problematic inputs: 181 + 182 + ```ocaml 183 + let () = 184 + add_test ~name:"empty_input" [ const () ] @@ fun () -> 185 + match parse "" with 186 + | Ok _ | Error _ -> check true 187 + 188 + let () = 189 + add_test ~name:"boundary_values" 190 + [ choose [ 191 + const 0; 192 + const (-1); 193 + const max_int; 194 + const min_int; 195 + ] ] 196 + @@ fun n -> 197 + match process n with 198 + | Ok _ | Error _ -> check true 199 + ``` 200 + 201 + #### 5. Type Mismatch Handling 202 + 203 + Test graceful handling of wrong types: 204 + 205 + ```ocaml 206 + let () = 207 + add_test ~name:"type_mismatch" [ bytes ] @@ fun input -> 208 + (* Parsing arbitrary bytes as int should fail gracefully *) 209 + match decode_int input with 210 + | Ok _ -> check true (* unexpected success is fine *) 211 + | Error _ -> check true (* expected failure *) 212 + ``` 213 + 214 + #### 6. Comparison with Reference Implementation 215 + 216 + ```ocaml 217 + let () = 218 + add_test ~name:"matches_reference" [ int; int ] @@ fun a b -> 219 + let result = MyLib.add a b in 220 + let expected = a + b in 221 + check (result = expected) 222 + ``` 223 + 224 + ### Handling Expected Failures 225 + 226 + When some inputs are expected to fail: 227 + 228 + ```ocaml 229 + (* Large integers may exceed safe precision *) 230 + let max_safe_int = 9007199254740991 231 + 232 + let () = 233 + add_test ~name:"large_int_handling" [ range max_int ] @@ fun n -> 234 + match encode n with 235 + | Error _ -> 236 + (* Large ints beyond safe precision correctly rejected *) 237 + check (n > max_safe_int) 238 + | Ok encoded -> 239 + match decode encoded with 240 + | Error _ -> check (n > max_safe_int) 241 + | Ok decoded -> check (decoded = n) 242 + ``` 243 + 244 + ### Float Comparison 245 + 246 + Use epsilon comparison for floating-point roundtrips: 247 + 248 + ```ocaml 249 + let () = 250 + add_test ~name:"float_roundtrip" 251 + [ choose [ const 0.0; const 3.14; const (-2.5) ] ] 252 + @@ fun f -> 253 + match encode f |> Result.bind decode with 254 + | Error e -> failwith e 255 + | Ok decoded -> 256 + (* Use epsilon comparison for floats *) 257 + check (Float.abs (decoded -. f) < 0.0001) 258 + ``` 259 + 260 + ## AFL Fuzzing Setup 261 + 262 + ### Prerequisites 263 + 264 + - OCaml compiler with AFL support 265 + - AFL or AFL++ installed 266 + - Dune 3.0+ 267 + 268 + ### Dune Workspace Configuration 269 + 270 + Create or update `dune-workspace` at project root: 271 + 272 + ```lisp 273 + (lang dune 3.20) 274 + 275 + (context default) 276 + 277 + (context 278 + (default 279 + (name afl) 280 + (profile afl))) 281 + 282 + (env 283 + (afl 284 + (ocamlopt_flags (:standard -afl-instrument)))) 285 + ``` 286 + 287 + This creates an `afl` build context that instruments binaries for coverage-guided fuzzing. 288 + 289 + ### Building for AFL 290 + 291 + ```bash 292 + dune build --context afl 293 + ``` 294 + 295 + Instrumented binaries are output to `_build/afl/`. 296 + 297 + ### Running AFL 298 + 299 + ```bash 300 + # Create input corpus with seed files 301 + mkdir -p input_corpus 302 + echo '{"key": "value"}' > input_corpus/json1 303 + echo 'key: value' > input_corpus/yaml1 304 + 305 + # Run AFL fuzzer 306 + afl-fuzz -i input_corpus -o findings -- _build/afl/fuzz/fuzz_mylib.exe @@ 307 + ``` 308 + 309 + ### Common AFL Issues 310 + 311 + #### Core Pattern Error 312 + 313 + ``` 314 + AFL: Oops, your system is configured to send core dump notifications to an external utility... 315 + ``` 316 + 317 + **Fix:** 318 + ```bash 319 + sudo su -c 'echo core > /proc/sys/kernel/core_pattern' 320 + ``` 321 + 322 + #### CPU Scaling Error 323 + 324 + ``` 325 + AFL: The CPU frequency scaling governor is set to "powersave"... 326 + ``` 327 + 328 + **Fix:** 329 + ```bash 330 + sudo cpufreq-set -g performance 331 + # Or use AFL_SKIP_CPUFREQ=1 to ignore (not recommended) 332 + ``` 333 + 334 + ## Complete Example 335 + 336 + Here's a complete fuzz test file for a YAML library: 337 + 338 + ```ocaml 339 + (** Property-based tests for MyYaml using Crowbar *) 340 + 341 + open Crowbar 342 + 343 + (* Helper functions *) 344 + let decode codec s = 345 + let reader = Bytesrw.Bytes.Reader.of_string s in 346 + MyYaml.decode codec reader 347 + 348 + let encode codec v = 349 + let buf = Buffer.create 256 in 350 + let writer = Bytesrw.Bytes.Writer.of_buffer buf in 351 + match MyYaml.encode codec v writer with 352 + | Ok () -> Ok (Buffer.contents buf) 353 + | Error e -> Error e 354 + 355 + (* Test: Arbitrary input shouldn't crash *) 356 + let () = 357 + add_test ~name:"yaml_no_crash" [ bytes ] @@ fun input -> 358 + match decode Codec.string input with 359 + | Ok _ | Error _ -> check true 360 + 361 + (* Test: String roundtrip *) 362 + let () = 363 + add_test ~name:"yaml_string_roundtrip" 364 + [ choose [ const "hello"; const ""; const "with\nnewline" ] ] 365 + @@ fun s -> 366 + match encode Codec.string s with 367 + | Error e -> failwith ("encode: " ^ e) 368 + | Ok yaml -> 369 + match decode Codec.string yaml with 370 + | Error e -> failwith ("decode: " ^ e) 371 + | Ok decoded -> check (decoded = s) 372 + 373 + (* Test: Int roundtrip *) 374 + let () = 375 + add_test ~name:"yaml_int_roundtrip" [ range 1000000 ] @@ fun n -> 376 + match encode Codec.int n with 377 + | Error e -> failwith ("encode: " ^ e) 378 + | Ok yaml -> 379 + match decode Codec.int yaml with 380 + | Error e -> failwith ("decode: " ^ e) 381 + | Ok decoded -> check (decoded = n) 382 + 383 + (* Test: Bool roundtrip *) 384 + let () = 385 + add_test ~name:"yaml_bool_roundtrip" [ bool ] @@ fun b -> 386 + match encode Codec.bool b with 387 + | Error e -> failwith ("encode: " ^ e) 388 + | Ok yaml -> 389 + match decode Codec.bool yaml with 390 + | Error e -> failwith ("decode: " ^ e) 391 + | Ok decoded -> check (decoded = b) 392 + 393 + (* Test: List roundtrip *) 394 + let () = 395 + add_test ~name:"yaml_list_roundtrip" 396 + [ choose [ const []; const [1;2;3]; const [0] ] ] 397 + @@ fun lst -> 398 + let codec = Codec.list Codec.int in 399 + match encode codec lst with 400 + | Error e -> failwith ("encode: " ^ e) 401 + | Ok yaml -> 402 + match decode codec yaml with 403 + | Error e -> failwith ("decode: " ^ e) 404 + | Ok decoded -> check (decoded = lst) 405 + 406 + (* Test: Empty input *) 407 + let () = 408 + add_test ~name:"yaml_empty" [ const () ] @@ fun () -> 409 + match decode Codec.string "" with 410 + | Ok _ | Error _ -> check true 411 + 412 + (* Test: Nested structures *) 413 + let () = 414 + add_test ~name:"yaml_nested" 415 + [ choose [ const [[]]; const [[1;2];[3;4]] ] ] 416 + @@ fun nested -> 417 + let codec = Codec.list (Codec.list Codec.int) in 418 + match encode codec nested with 419 + | Error e -> failwith ("encode: " ^ e) 420 + | Ok yaml -> 421 + match decode codec yaml with 422 + | Error e -> failwith ("decode: " ^ e) 423 + | Ok decoded -> check (decoded = nested) 424 + ``` 425 + 426 + ## Tips for Writing Effective Fuzz Tests 427 + 428 + 1. **Start with crash resistance** - The most basic test ensures arbitrary input doesn't crash your code. 429 + 430 + 2. **Test roundtrips** - If you have encode/decode pairs, roundtrip tests catch many bugs. 431 + 432 + 3. **Use `choose` for edge cases** - Combine random generation with known problematic values. 433 + 434 + 4. **Keep generators focused** - Generate only valid inputs for properties that require them. 435 + 436 + 5. **Handle expected failures gracefully** - Use pattern matching to distinguish expected vs unexpected failures. 437 + 438 + 6. **Test at boundaries** - Empty inputs, maximum values, and type boundaries often reveal bugs. 439 + 440 + 7. **Run QuickCheck mode in CI** - It's fast and catches many issues without AFL setup. 441 + 442 + 8. **Use AFL for deep testing** - AFL's coverage guidance finds edge cases that random testing misses. 443 + 444 + ## Bugs Found by This Approach 445 + 446 + Example bugs discovered through fuzz testing: 447 + 448 + - **Empty string encoding** - Empty strings weren't properly quoted, causing decode failures 449 + - **Empty array encoding** - Empty arrays `[]` weren't using flow style, breaking roundtrips 450 + - **Large integer precision** - Integers > 2^53-1 lost precision in float-based formats 451 + - **Option type decoding** - Nested type combinators weren't being unwrapped during scalar decoding 452 + - **Special character handling** - Strings with newlines or special YAML characters needed escaping 453 + 454 + ### HTML Parser Bugs (html5rw) 455 + 456 + Fuzz testing html5rw discovered these serialization bugs: 457 + 458 + - **Raw text element double-escaping** - Elements like `<script>`, `<style>`, `<xmp>`, `<iframe>`, `<noembed>`, `<noframes>`, `<noscript>` were having their content HTML-escaped during serialization, which broke roundtrips (e.g., `<` becoming `&lt;` inside script tags) 459 + - **Escapable raw text handling** - Elements `<textarea>` and `<title>` need special treatment: only `&` needs escaping, not `<` or `>` 460 + - **Plaintext element serialization** - The `<plaintext>` element cannot be closed in HTML5, so serialization must stop after emitting its content 461 + 462 + ## AFL++ Compatibility Notes 463 + 464 + When using AFL++ (modern fork of AFL) with OCaml: 465 + 466 + ### Crowbar Persistent Mode Issues 467 + 468 + Crowbar's built-in AFL support may not work with AFL++. The `@@` file-based mode with Crowbar often results in "No instrumentation detected" errors even when instrumentation is present. 469 + 470 + **Solution**: Create a separate AFL fuzzer using `afl-persistent` directly: 471 + 472 + ```ocaml 473 + (* fuzz_afl.ml - Simple AFL-compatible fuzzer *) 474 + 475 + let fuzz_input input = 476 + try 477 + (* Your fuzzing logic here *) 478 + let result = MyLib.parse input in 479 + let serialized = MyLib.serialize result in 480 + (* Check properties, assert on failures *) 481 + if not (some_property serialized) then 482 + assert false; 483 + true 484 + with 485 + | Assert_failure _ -> raise (Assert_failure ("", 0, 0)) 486 + | _ -> true (* Expected failures for malformed input *) 487 + 488 + let read_file filename = 489 + let ic = open_in_bin filename in 490 + let n = in_channel_length ic in 491 + let s = really_input_string ic n in 492 + close_in ic; 493 + s 494 + 495 + let () = 496 + AflPersistent.run (fun () -> 497 + if Array.length Sys.argv < 2 then begin 498 + Printf.eprintf "Usage: %s <input_file>\n" Sys.argv.(0); 499 + exit 1 500 + end; 501 + let input = read_file Sys.argv.(1) in 502 + ignore (fuzz_input input) 503 + ) 504 + ``` 505 + 506 + Add `afl-persistent` to dependencies in dune: 507 + 508 + ```lisp 509 + (executable 510 + (name fuzz_afl) 511 + (libraries mylib afl-persistent)) 512 + ``` 513 + 514 + ### Building with AFL Instrumentation 515 + 516 + Use `-x afl` flag (not `--context afl`): 517 + 518 + ```bash 519 + dune build -x afl ./fuzz/fuzz_afl.exe 520 + ``` 521 + 522 + ### Environment Variables for AFL++ 523 + 524 + ```bash 525 + # Suppress core_pattern warning (when you can't modify /proc/sys/kernel/core_pattern) 526 + export AFL_I_DONT_CARE_ABOUT_MISSING_CRASHES=1 527 + 528 + # Skip CPU frequency scaling check 529 + export AFL_SKIP_CPUFREQ=1 530 + 531 + # Run AFL 532 + afl-fuzz -i input_corpus -o output -- _build/afl/fuzz/fuzz_afl.exe @@ 533 + ``` 534 + 535 + ### Verifying Instrumentation 536 + 537 + Use `afl-showmap` to verify instrumentation is detected: 538 + 539 + ```bash 540 + afl-showmap -o /tmp/map -- _build/afl/fuzz/fuzz_afl.exe /tmp/test_input 541 + cat /tmp/map | head -20 # Should show tuples like "000062:3" 542 + ``` 543 + 544 + A healthy instrumentation should capture 200+ tuples for a non-trivial program. 545 + 546 + ## Creating Effective Seed Corpus 547 + 548 + For HTML parsers, include diverse constructs: 549 + 550 + ```bash 551 + mkdir -p fuzz/input_corpus 552 + 553 + # Basic structure 554 + echo '<html><head><title>Test</title></head><body><p>Hello</p></body></html>' > fuzz/input_corpus/basic.html 555 + 556 + # Raw text elements (critical for escaping bugs) 557 + echo '<script>if (a < b && c > d) {}</script>' > fuzz/input_corpus/script.html 558 + echo '<style>.foo { content: "<bar>" }</style>' > fuzz/input_corpus/style.html 559 + 560 + # Escapable raw text elements 561 + echo '<textarea>Text with <html> tags & entities</textarea>' > fuzz/input_corpus/textarea.html 562 + 563 + # Tables (complex nesting rules) 564 + echo '<table><tr><td>Cell</td></tr></table>' > fuzz/input_corpus/table.html 565 + 566 + # Forms 567 + echo '<form><input type="text"><button>Submit</button></form>' > fuzz/input_corpus/form.html 568 + 569 + # SVG/MathML (foreign content) 570 + echo '<svg><circle r="50"/></svg>' > fuzz/input_corpus/svg.html 571 + echo '<math><mrow><mi>x</mi></mrow></math>' > fuzz/input_corpus/math.html 572 + 573 + # Entities 574 + echo '<p>&amp; &lt; &gt; &nbsp;</p>' > fuzz/input_corpus/entities.html 575 + 576 + # Template elements 577 + echo '<template><div>Content</div></template>' > fuzz/input_corpus/template.html 578 + 579 + # Malformed HTML (parser recovery) 580 + echo '<p>Unclosed<div>Mixed</p></div>' > fuzz/input_corpus/malformed.html 581 + 582 + # Comments 583 + echo '<!--comment-->Text<!---->More' > fuzz/input_corpus/comment.html 584 + ``` 585 + 586 + ## Recommended Test Properties for Parsers 587 + 588 + ### 1. Roundtrip Stabilization 589 + 590 + The most effective parser test: parse โ†’ serialize โ†’ parse โ†’ serialize should produce identical output on the second iteration: 591 + 592 + ```ocaml 593 + let () = 594 + add_test ~name:"roundtrip_stabilizes" [ bytes ] @@ fun input -> 595 + let r1 = parse input in 596 + let s1 = serialize r1 in 597 + let r2 = parse s1 in 598 + let s2 = serialize r2 in 599 + let r3 = parse s2 in 600 + let s3 = serialize r3 in 601 + (* First roundtrip may differ, but second must stabilize *) 602 + check (s2 = s3) 603 + ``` 604 + 605 + ### 2. Clone Consistency 606 + 607 + For DOM-like structures with clone operations: 608 + 609 + ```ocaml 610 + let () = 611 + add_test ~name:"clone_identical" [ bytes ] @@ fun input -> 612 + let doc = parse input in 613 + let root = get_root doc in 614 + let cloned = clone ~deep:true root in 615 + check (to_html root = to_html cloned) 616 + ``` 617 + 618 + ### 3. Error Consistency 619 + 620 + Parse errors should be deterministic: 621 + 622 + ```ocaml 623 + let () = 624 + add_test ~name:"errors_consistent" [ bytes ] @@ fun input -> 625 + let r1 = parse input in 626 + let r2 = parse input in 627 + check (get_errors r1 = get_errors r2) 628 + ``` 629 + 630 + ## Workflow for Fuzz-Driven Bug Fixing 631 + 632 + 1. **Run QuickCheck mode first** - Fast iteration, catches obvious bugs 633 + ```bash 634 + dune exec ./fuzz/fuzz_mylib.exe 635 + ``` 636 + 637 + 2. **Fix bugs, re-run** - Iterate until QuickCheck passes 638 + 639 + 3. **Run AFL for deeper coverage** - 5-10 minutes initially 640 + ```bash 641 + dune build -x afl ./fuzz/fuzz_afl.exe 642 + AFL_I_DONT_CARE_ABOUT_MISSING_CRASHES=1 AFL_SKIP_CPUFREQ=1 \ 643 + timeout 600 afl-fuzz -i fuzz/input_corpus -o fuzz/output -- \ 644 + _build/afl/fuzz/fuzz_afl.exe @@ 645 + ``` 646 + 647 + 4. **Check for crashes** - Examine `fuzz/output/default/crashes/` 648 + ```bash 649 + ls fuzz/output/default/crashes/ 650 + # Reproduce crash: 651 + _build/afl/fuzz/fuzz_afl.exe fuzz/output/default/crashes/id:000000,... 652 + ``` 653 + 654 + 5. **Fix and repeat** - Continue until no crashes found after extended runs 655 + 656 + 6. **Measure coverage** - Track corpus growth and edge coverage 657 + ```bash 658 + cat fuzz/output/default/fuzzer_stats | grep -E "(execs_done|corpus_count|saved_crashes)" 659 + ```
+1 -1
bin/dune
··· 2 (name html5check) 3 (public_name html5check) 4 (package html5rw) 5 - (libraries htmlrw_check html5rw bytesrw cmdliner unix))
··· 2 (name html5check) 3 (public_name html5check) 4 (package html5rw) 5 + (libraries htmlrw_check html5rw bytesrw cmdliner))
+20 -2
dune-project
··· 1 - (lang dune 3.21) 2 (name html5rw) 3 4 (generate_opam_files true) 5 6 (license MIT) 7 (authors "Anil Madhavapeddy <anil@recoil.org>") 8 (maintainers "Anil Madhavapeddy <anil@recoil.org>") 9 - (source (tangled anil.recoil.org/ocaml-html5rw)) 10 11 (package 12 (name html5rw) ··· 28 (jsont (>= 0.2.0)) 29 (cmdliner (>= 1.3.0)))) 30
··· 1 + (lang dune 3.20) 2 + 3 (name html5rw) 4 5 (generate_opam_files true) 6 7 (license MIT) 8 (authors "Anil Madhavapeddy <anil@recoil.org>") 9 + (homepage "https://tangled.org/@anil.recoil.org/ocaml-html5rw") 10 (maintainers "Anil Madhavapeddy <anil@recoil.org>") 11 + (bug_reports "https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues") 12 + (maintenance_intent "(latest)") 13 14 (package 15 (name html5rw) ··· 31 (jsont (>= 0.2.0)) 32 (cmdliner (>= 1.3.0)))) 33 34 + (package 35 + (name html5rw-js) 36 + (allow_empty) 37 + (synopsis "Browser-based HTML5 parser via js_of_ocaml/wasm_of_ocaml") 38 + (description 39 + "JavaScript and WebAssembly builds of the html5rw HTML5 parser for browser use. \ 40 + Includes a main validator library, web worker for background validation, and \ 41 + browser-based test runner.") 42 + (depends 43 + (ocaml (>= 5.1.0)) 44 + (html5rw (= :version)) 45 + (js_of_ocaml (>= 5.0)) 46 + (js_of_ocaml-ppx (>= 5.0)) 47 + (wasm_of_ocaml-compiler (>= 5.0)) 48 + (brr (>= 0.0.6))))
+12
dune-workspace
···
··· 1 + (lang dune 3.20) 2 + 3 + (context default) 4 + 5 + (context 6 + (default 7 + (name afl) 8 + (profile afl))) 9 + 10 + (env 11 + (afl 12 + (ocamlopt_flags (:standard -afl-instrument))))
+8 -8
examples/dune
··· 1 (executable 2 (name basic_parsing) 3 (modules basic_parsing) 4 - (libraries bytesrw html5rw unix)) 5 6 (executable 7 (name css_selectors) 8 (modules css_selectors) 9 - (libraries bytesrw html5rw unix)) 10 11 (executable 12 (name dom_manipulation) 13 (modules dom_manipulation) 14 - (libraries bytesrw html5rw unix)) 15 16 (executable 17 (name text_extraction) 18 (modules text_extraction) 19 - (libraries bytesrw html5rw unix)) 20 21 (executable 22 (name error_handling) 23 (modules error_handling) 24 - (libraries bytesrw html5rw unix)) 25 26 (executable 27 (name fragment_parsing) 28 (modules fragment_parsing) 29 - (libraries bytesrw html5rw unix)) 30 31 (executable 32 (name encoding_detection) 33 (modules encoding_detection) 34 - (libraries bytesrw html5rw unix)) 35 36 (executable 37 (name web_scraper) 38 (modules web_scraper) 39 - (libraries bytesrw html5rw unix))
··· 1 (executable 2 (name basic_parsing) 3 (modules basic_parsing) 4 + (libraries bytesrw html5rw)) 5 6 (executable 7 (name css_selectors) 8 (modules css_selectors) 9 + (libraries bytesrw html5rw)) 10 11 (executable 12 (name dom_manipulation) 13 (modules dom_manipulation) 14 + (libraries bytesrw html5rw)) 15 16 (executable 17 (name text_extraction) 18 (modules text_extraction) 19 + (libraries bytesrw html5rw)) 20 21 (executable 22 (name error_handling) 23 (modules error_handling) 24 + (libraries bytesrw html5rw)) 25 26 (executable 27 (name fragment_parsing) 28 (modules fragment_parsing) 29 + (libraries bytesrw html5rw)) 30 31 (executable 32 (name encoding_detection) 33 (modules encoding_detection) 34 + (libraries bytesrw html5rw)) 35 36 (executable 37 (name web_scraper) 38 (modules web_scraper) 39 + (libraries bytesrw html5rw))
+55
fuzz/dune
···
··· 1 + (executable 2 + (name fuzz_html5rw) 3 + (libraries bytesrw html5rw crowbar)) 4 + 5 + (executable 6 + (name fuzz_afl) 7 + (libraries bytesrw html5rw afl-persistent)) 8 + 9 + (executable (name test_crash) (libraries bytesrw html5rw)) 10 + (executable (name test_pre) (libraries bytesrw html5rw)) 11 + 12 + ; Property-based testing (AFL) 13 + (executable 14 + (name fuzz_properties) 15 + (libraries bytesrw html5rw str afl-persistent)) 16 + 17 + ; Structure-aware fuzzer (AFL) 18 + (executable 19 + (name fuzz_structure) 20 + (libraries bytesrw html5rw str)) 21 + 22 + ; Resource exhaustion tests 23 + (executable 24 + (name fuzz_exhaustion) 25 + (libraries bytesrw html5rw unix)) 26 + 27 + ; Error recovery tests 28 + (executable 29 + (name fuzz_error_recovery) 30 + (libraries bytesrw html5rw)) 31 + 32 + ; Serializer-specific tests 33 + (executable 34 + (name fuzz_serializer) 35 + (libraries bytesrw html5rw str)) 36 + 37 + ; Streaming/incremental tests 38 + (executable 39 + (name fuzz_streaming) 40 + (libraries bytesrw html5rw)) 41 + 42 + ; Encoding tests 43 + (executable 44 + (name fuzz_encoding) 45 + (libraries bytesrw html5rw)) 46 + 47 + ; Fragment parsing tests 48 + (executable 49 + (name fuzz_fragment) 50 + (libraries bytesrw html5rw)) 51 + 52 + ; Security/sanitizer tests 53 + (executable 54 + (name fuzz_security) 55 + (libraries bytesrw html5rw str))
+92
fuzz/fuzz_afl.ml
···
··· 1 + (** Simple AFL-compatible fuzzer for html5rw 2 + 3 + This fuzzer reads input from a file (passed as command line argument) 4 + and runs several property tests on it. It uses AflPersistent for 5 + efficient AFL fuzzing. 6 + *) 7 + 8 + (* Helper to create a bytes reader from a string *) 9 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 10 + 11 + (* Serialize a parse result to string *) 12 + let serialize result = 13 + Html5rw.to_string ~pretty:false result 14 + 15 + (* Main fuzzing function - returns true if test passes *) 16 + let fuzz_input input = 17 + try 18 + (* Test 1: Parse should not crash *) 19 + let result = Html5rw.parse (reader_of_string input) in 20 + 21 + (* Test 2: Serialization should not crash *) 22 + let serialized = serialize result in 23 + 24 + (* Test 3: Reparse should not crash *) 25 + let result2 = Html5rw.parse (reader_of_string serialized) in 26 + let serialized2 = serialize result2 in 27 + 28 + (* Test 4: Roundtrip should stabilize (s2 == s3) *) 29 + let result3 = Html5rw.parse (reader_of_string serialized2) in 30 + let serialized3 = serialize result3 in 31 + 32 + if serialized2 <> serialized3 then begin 33 + Printf.eprintf "ROUNDTRIP BUG:\n"; 34 + Printf.eprintf "Input: %s\n" (String.escaped (String.sub input 0 (min 100 (String.length input)))); 35 + Printf.eprintf "s2: %s\n" (String.escaped (String.sub serialized2 0 (min 100 (String.length serialized2)))); 36 + Printf.eprintf "s3: %s\n" (String.escaped (String.sub serialized3 0 (min 100 (String.length serialized3)))); 37 + (* Signal a bug to AFL by aborting *) 38 + assert false 39 + end; 40 + 41 + (* Test 5: Text extraction should not crash *) 42 + let _ = Html5rw.to_text result in 43 + 44 + (* Test 6: Clone should produce identical output *) 45 + let root = Html5rw.root result in 46 + let cloned = Html5rw.clone ~deep:true root in 47 + let original_html = Html5rw.Dom.to_html ~pretty:false root in 48 + let cloned_html = Html5rw.Dom.to_html ~pretty:false cloned in 49 + 50 + if original_html <> cloned_html then begin 51 + Printf.eprintf "CLONE BUG:\n"; 52 + Printf.eprintf "Original: %s\n" (String.escaped (String.sub original_html 0 (min 100 (String.length original_html)))); 53 + Printf.eprintf "Cloned: %s\n" (String.escaped (String.sub cloned_html 0 (min 100 (String.length cloned_html)))); 54 + assert false 55 + end; 56 + 57 + (* Test 7: Selector queries should not crash (test a few common patterns) *) 58 + let selectors = ["*"; "div"; ".class"; "#id"; "div > p"; "[attr]"] in 59 + List.iter (fun sel -> 60 + try 61 + let _ = Html5rw.query result sel in () 62 + with Html5rw.Selector.Selector_error _ -> () 63 + ) selectors; 64 + 65 + true 66 + with 67 + | Assert_failure _ -> 68 + (* Re-raise assert failures so AFL sees the crash *) 69 + raise (Assert_failure ("", 0, 0)) 70 + | _ -> 71 + (* Other exceptions are expected for malformed input *) 72 + true 73 + 74 + (* Read file contents *) 75 + let read_file filename = 76 + let ic = open_in_bin filename in 77 + let n = in_channel_length ic in 78 + let s = really_input_string ic n in 79 + close_in ic; 80 + s 81 + 82 + (* Main entry point *) 83 + let () = 84 + (* Use AflPersistent for efficient AFL fuzzing *) 85 + AflPersistent.run (fun () -> 86 + if Array.length Sys.argv < 2 then begin 87 + Printf.eprintf "Usage: %s <input_file>\n" Sys.argv.(0); 88 + exit 1 89 + end; 90 + let input = read_file Sys.argv.(1) in 91 + ignore (fuzz_input input) 92 + )
+260
fuzz/fuzz_encoding.ml
···
··· 1 + (* Encoding fuzzer for HTML5rw 2 + Tests UTF-8 handling, BOM, surrogates, and encoding edge cases *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Test helper: parse and serialize, check for crashes *) 7 + let test_encoding input = 8 + try 9 + let r = Html5rw.parse (reader_of_string input) in 10 + let _ = Html5rw.to_string ~pretty:false r in 11 + true 12 + with _ -> 13 + false 14 + 15 + (* Test helper: check roundtrip stability *) 16 + let test_roundtrip input = 17 + try 18 + let r1 = Html5rw.parse (reader_of_string input) in 19 + let s1 = Html5rw.to_string ~pretty:false r1 in 20 + let r2 = Html5rw.parse (reader_of_string s1) in 21 + let s2 = Html5rw.to_string ~pretty:false r2 in 22 + let r3 = Html5rw.parse (reader_of_string s2) in 23 + let s3 = Html5rw.to_string ~pretty:false r3 in 24 + s2 = s3 25 + with _ -> false 26 + 27 + (* UTF-8 BOM *) 28 + let bom_cases = [| 29 + "\xEF\xBB\xBF"; (* Just BOM *) 30 + "\xEF\xBB\xBF<!DOCTYPE html>"; (* BOM + DOCTYPE *) 31 + "\xEF\xBB\xBF<div>test</div>"; (* BOM + content *) 32 + "\xEF\xBB\xBF\xEF\xBB\xBF"; (* Double BOM *) 33 + "<div>\xEF\xBB\xBF</div>"; (* BOM in content - should be preserved as text *) 34 + |] 35 + 36 + (* Valid UTF-8 sequences *) 37 + let valid_utf8_cases = [| 38 + (* 1-byte (ASCII) *) 39 + "hello"; 40 + "<div>test</div>"; 41 + 42 + (* 2-byte sequences *) 43 + "\xC2\xA0"; (* NBSP *) 44 + "\xC3\xA9"; (* รฉ *) 45 + "caf\xC3\xA9"; (* cafรฉ *) 46 + "\xC2\xAB\xC2\xBB"; (* ยซ ยป *) 47 + 48 + (* 3-byte sequences *) 49 + "\xE2\x80\x93"; (* en-dash *) 50 + "\xE2\x80\x94"; (* em-dash *) 51 + "\xE2\x80\x99"; (* right single quote *) 52 + "\xE2\x80\x9C\xE2\x80\x9D"; (* curly quotes *) 53 + "\xE4\xB8\xAD\xE6\x96\x87"; (* ไธญๆ–‡ *) 54 + "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"; (* ๆ—ฅๆœฌ่ชž *) 55 + 56 + (* 4-byte sequences (emoji, etc.) *) 57 + "\xF0\x9F\x98\x80"; (* ๐Ÿ˜€ *) 58 + "\xF0\x9F\x8E\x89"; (* ๐ŸŽ‰ *) 59 + "\xF0\x9D\x94\xB8"; (* ๐”ธ mathematical double-struck *) 60 + 61 + (* Mixed *) 62 + "<div>\xC3\xA9\xE2\x80\x93\xF0\x9F\x98\x80</div>"; 63 + "<span title=\"\xC3\xA9\">text</span>"; 64 + |] 65 + 66 + (* Invalid UTF-8 sequences (should be handled gracefully) *) 67 + let invalid_utf8_cases = [| 68 + (* Lone continuation bytes *) 69 + "\x80"; 70 + "\xBF"; 71 + "\x80\x80\x80"; 72 + 73 + (* Overlong sequences *) 74 + "\xC0\x80"; (* Overlong NUL *) 75 + "\xE0\x80\x80"; (* Overlong NUL (3-byte) *) 76 + "\xF0\x80\x80\x80"; (* Overlong NUL (4-byte) *) 77 + "\xC0\xAF"; (* Overlong / *) 78 + "\xC1\xBF"; (* Overlong DEL *) 79 + 80 + (* Truncated sequences *) 81 + "\xC2"; (* Start of 2-byte, missing continuation *) 82 + "\xE0\x80"; (* Start of 3-byte, missing continuation *) 83 + "\xF0\x80\x80"; (* Start of 4-byte, missing continuation *) 84 + 85 + (* Invalid start bytes *) 86 + "\xFE"; 87 + "\xFF"; 88 + "\xFE\xFF"; (* UTF-16 BE BOM as bytes *) 89 + "\xFF\xFE"; (* UTF-16 LE BOM as bytes *) 90 + 91 + (* Surrogate pairs (invalid in UTF-8) *) 92 + "\xED\xA0\x80"; (* High surrogate U+D800 *) 93 + "\xED\xBF\xBF"; (* Low surrogate U+DFFF *) 94 + "\xED\xA0\x80\xED\xB0\x80"; (* Surrogate pair (should be single 4-byte) *) 95 + 96 + (* Out of range *) 97 + "\xF4\x90\x80\x80"; (* U+110000, beyond Unicode *) 98 + "\xF7\xBF\xBF\xBF"; (* U+1FFFFF, way beyond *) 99 + 100 + (* Invalid sequence in tag *) 101 + "<div\x80>"; 102 + "<div class=\"\x80\">"; 103 + "</\x80div>"; 104 + 105 + (* Invalid in attribute value *) 106 + "<div data-x=\"\xC0\xAF\">"; 107 + "<div title=\"\xED\xA0\x80\">"; 108 + |] 109 + 110 + (* Control characters *) 111 + let control_char_cases = [| 112 + (* NUL *) 113 + "\x00"; 114 + "<div>\x00</div>"; 115 + "<div attr=\"\x00\">"; 116 + 117 + (* Other C0 controls *) 118 + "\x01\x02\x03"; 119 + "<div>\x08</div>"; (* backspace *) 120 + "<div>\x0B</div>"; (* vertical tab *) 121 + "<div>\x0C</div>"; (* form feed *) 122 + 123 + (* C1 controls (as UTF-8) *) 124 + "\xC2\x80"; (* U+0080 *) 125 + "\xC2\x9F"; (* U+009F *) 126 + 127 + (* DEL *) 128 + "\x7F"; 129 + 130 + (* Mixed with valid content *) 131 + "<div>hello\x00world</div>"; 132 + "<div>test\x01\x02\x03</div>"; 133 + |] 134 + 135 + (* Unicode special characters *) 136 + let special_unicode_cases = [| 137 + (* Zero-width characters *) 138 + "\xE2\x80\x8B"; (* ZWSP U+200B *) 139 + "\xE2\x80\x8C"; (* ZWNJ U+200C *) 140 + "\xE2\x80\x8D"; (* ZWJ U+200D *) 141 + "\xEF\xBB\xBF"; (* BOM/ZWNBSP U+FEFF *) 142 + 143 + (* Replacement character *) 144 + "\xEF\xBF\xBD"; (* U+FFFD *) 145 + 146 + (* Byte order marks and special noncharacters *) 147 + "\xEF\xBF\xBE"; (* U+FFFE - noncharacter *) 148 + "\xEF\xBF\xBF"; (* U+FFFF - noncharacter *) 149 + 150 + (* Private use area *) 151 + "\xEE\x80\x80"; (* U+E000 *) 152 + "\xEF\xA3\xBF"; (* U+F8FF *) 153 + 154 + (* RTL and BiDi *) 155 + "\xE2\x80\x8F"; (* RLM U+200F *) 156 + "\xE2\x80\xAE"; (* RLO U+202E *) 157 + "\xE2\x80\xAC"; (* PDF U+202C *) 158 + 159 + (* Combining characters *) 160 + "e\xCC\x81"; (* e + combining acute = รฉ *) 161 + "a\xCC\x80\xCC\x81\xCC\x82"; (* Multiple combining marks *) 162 + 163 + (* In HTML context *) 164 + "<div>\xE2\x80\x8B</div>"; 165 + "<span title=\"\xE2\x80\x8F\">"; 166 + |] 167 + 168 + (* Numeric character references *) 169 + let ncr_cases = [| 170 + (* Valid decimal *) 171 + "&#65;"; (* A *) 172 + "&#169;"; (* ยฉ *) 173 + "&#8212;"; (* em-dash *) 174 + "&#128512;"; (* ๐Ÿ˜€ *) 175 + 176 + (* Valid hex *) 177 + "&#x41;"; 178 + "&#xA9;"; 179 + "&#x2014;"; 180 + "&#x1F600;"; 181 + 182 + (* Edge cases *) 183 + "&#0;"; (* NUL - should become replacement *) 184 + "&#x0;"; 185 + "&#127;"; (* DEL *) 186 + "&#128;"; (* C1 control *) 187 + "&#159;"; (* Last C1 control *) 188 + 189 + (* Surrogates (should be replaced) *) 190 + "&#xD800;"; 191 + "&#xDFFF;"; 192 + "&#55296;"; (* D800 decimal *) 193 + 194 + (* Noncharacters *) 195 + "&#xFFFE;"; 196 + "&#xFFFF;"; 197 + "&#x1FFFE;"; 198 + 199 + (* Beyond Unicode *) 200 + "&#x110000;"; 201 + "&#1114112;"; 202 + 203 + (* Very large numbers *) 204 + "&#999999999;"; 205 + "&#xFFFFFFFFF;"; 206 + 207 + (* Invalid formats *) 208 + "&#;"; 209 + "&#x;"; 210 + "&#xGHI;"; 211 + "&#abc;"; 212 + |] 213 + 214 + let run_test_category name cases test_fn = 215 + let passed = ref 0 in 216 + let failed = ref 0 in 217 + Array.iter (fun input -> 218 + if test_fn input then 219 + incr passed 220 + else begin 221 + Printf.printf " FAIL: %s\n" 222 + (String.escaped (String.sub input 0 (min 40 (String.length input)))); 223 + incr failed 224 + end 225 + ) cases; 226 + Printf.printf "%s: %d/%d\n" name !passed (Array.length cases); 227 + !failed = 0 228 + 229 + let () = 230 + Printf.printf "=== Encoding Tests ===\n\n"; 231 + 232 + let all_pass = ref true in 233 + 234 + (* Test basic handling (no exceptions) *) 235 + Printf.printf "--- Crash resistance tests ---\n"; 236 + if not (run_test_category "BOM handling" bom_cases test_encoding) then 237 + all_pass := false; 238 + if not (run_test_category "Valid UTF-8" valid_utf8_cases test_encoding) then 239 + all_pass := false; 240 + if not (run_test_category "Invalid UTF-8" invalid_utf8_cases test_encoding) then 241 + all_pass := false; 242 + if not (run_test_category "Control characters" control_char_cases test_encoding) then 243 + all_pass := false; 244 + if not (run_test_category "Special Unicode" special_unicode_cases test_encoding) then 245 + all_pass := false; 246 + if not (run_test_category "Numeric character refs" ncr_cases test_encoding) then 247 + all_pass := false; 248 + 249 + (* Test roundtrip stability for valid cases *) 250 + Printf.printf "\n--- Roundtrip stability tests ---\n"; 251 + if not (run_test_category "Valid UTF-8 roundtrip" valid_utf8_cases test_roundtrip) then 252 + all_pass := false; 253 + 254 + Printf.printf "\n=== Summary ===\n"; 255 + if !all_pass then 256 + Printf.printf "All encoding tests passed!\n" 257 + else begin 258 + Printf.printf "Some encoding tests failed!\n"; 259 + exit 1 260 + end
+224
fuzz/fuzz_error_recovery.ml
···
··· 1 + (* Error recovery fuzzer for HTML5rw 2 + Tests the parser's ability to handle and recover from various error conditions *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Category 1: Unclosed tags *) 7 + let unclosed_tag_cases = [| 8 + "<div>"; 9 + "<div><span>"; 10 + "<div><div><div>"; 11 + "<p><p><p>"; 12 + "<a href='x'><b><i>"; 13 + "<table><tr><td>"; 14 + "<ul><li><li><li>"; 15 + "<select><option><option>"; 16 + "<dl><dt><dd>"; 17 + |] 18 + 19 + (* Category 2: Misnested tags *) 20 + let misnested_cases = [| 21 + "<a><b></a></b>"; 22 + "<b><i></b></i>"; 23 + "<p><div></p></div>"; 24 + "<a href='x'><div></a></div>"; 25 + "<em><strong></em></strong>"; 26 + "<b><i><u></b></i></u>"; 27 + "<table><div></table></div>"; 28 + "<span><div></span></div>"; 29 + |] 30 + 31 + (* Category 3: Invalid tag names *) 32 + let invalid_tag_cases = [| 33 + "<123>"; 34 + "<!>"; 35 + "<>"; 36 + "< >"; 37 + "<\t>"; 38 + "<\n>"; 39 + "<%>"; 40 + "<&>"; 41 + "<div<>"; 42 + "<a<b>"; 43 + |] 44 + 45 + (* Category 4: Invalid attributes *) 46 + let invalid_attr_cases = [| 47 + "<div =value>"; 48 + "<div 123=value>"; 49 + "<div a=b=c>"; 50 + "<div a='b\"c>"; 51 + "<div a=\"b'c>"; 52 + "<div a=<b>"; 53 + "<div a=>>"; 54 + "<div a b c>"; 55 + "<div onclick=\"<script>\">"; 56 + |] 57 + 58 + (* Category 5: Premature EOF *) 59 + let premature_eof_cases = [| 60 + "<div"; 61 + "<div attr"; 62 + "<div attr="; 63 + "<div attr='val"; 64 + "<div attr=\"val"; 65 + "<!DOCTYPE"; 66 + "<!DOCTYPE html"; 67 + "<!--"; 68 + "<!-- comment"; 69 + "<![CDATA["; 70 + "<script>alert("; 71 + "<style>.foo {"; 72 + |] 73 + 74 + (* Category 6: Invalid nesting (semantic) *) 75 + let semantic_nesting_cases = [| 76 + "<a><a></a></a>"; 77 + "<form><form></form></form>"; 78 + "<button><button></button></button>"; 79 + "<p><p></p></p>"; 80 + "<li><li></li></li>"; 81 + "<dt><dt></dt></dt>"; 82 + "<table><table></table></table>"; 83 + "<select><select></select></select>"; 84 + |] 85 + 86 + (* Category 7: Foreign content errors *) 87 + let foreign_content_cases = [| 88 + "<svg><div></svg>"; 89 + "<math><span></math>"; 90 + "<svg><foreignObject><svg></foreignObject></svg>"; 91 + "<svg><title><svg></title></svg>"; 92 + "<math><annotation-xml><math></annotation-xml></math>"; 93 + "<svg><script>x</svg>"; 94 + "<svg><style>.x{}</svg>"; 95 + |] 96 + 97 + (* Category 8: DOCTYPE errors *) 98 + let doctype_cases = [| 99 + "<!DOCTYPE>"; 100 + "<!DOCTYPE html PUBLIC>"; 101 + "<!DOCTYPE html SYSTEM>"; 102 + "<!DOCTYPE html PUBLIC \"\" \"\">"; 103 + "<!doctypehtml>"; 104 + "<!DOCTYPEhtml>"; 105 + "<!doctype\nhtml>"; 106 + "<!doctype\thtml>"; 107 + "<!doctype html><!doctype html>"; 108 + |] 109 + 110 + (* Category 9: Comment errors *) 111 + let comment_cases = [| 112 + "<!---->"; 113 + "<!--->"; 114 + "<!--->"; 115 + "<!--a--!>"; 116 + "<!--a--!-->"; 117 + "<!----!>"; 118 + "<!--<script>-->"; 119 + "<!--<!---->"; 120 + "<!->"; 121 + |] 122 + 123 + (* Category 10: Entity errors *) 124 + let entity_cases = [| 125 + "&;"; 126 + "&xyz;"; 127 + "&amp"; 128 + "&#;"; 129 + "&#x;"; 130 + "&#xGHI;"; 131 + "&#99999999999;"; 132 + "&nosuchentity;"; 133 + "&#xFFFFFFFF;"; 134 + "&#0;"; 135 + |] 136 + 137 + (* Category 11: Script/style content *) 138 + let rawtext_cases = [| 139 + "<script></script></script>"; 140 + "<script><!--</script>--></script>"; 141 + "<script><![CDATA[</script>]]></script>"; 142 + "<style></style></style>"; 143 + "<textarea></textarea></textarea>"; 144 + "<title></title></title>"; 145 + "<script>x</script"; 146 + "<xmp></xmp></xmp>"; 147 + |] 148 + 149 + (* Category 12: Table structure errors *) 150 + let table_cases = [| 151 + "<table><div>x</div></table>"; 152 + "<table><tr></table>"; 153 + "<table><td></table>"; 154 + "<table><caption><table></caption></table>"; 155 + "<table><tbody><td></table>"; 156 + "<table><thead><th></table>"; 157 + "<table><col></table>"; 158 + "<table><tr><th><td></table>"; 159 + "<tr><td>x</td></tr>"; 160 + |] 161 + 162 + let test_error_recovery input category = 163 + try 164 + let r1 = Html5rw.parse (reader_of_string input) in 165 + let s1 = Html5rw.to_string ~pretty:false r1 in 166 + let r2 = Html5rw.parse (reader_of_string s1) in 167 + let s2 = Html5rw.to_string ~pretty:false r2 in 168 + let r3 = Html5rw.parse (reader_of_string s2) in 169 + let s3 = Html5rw.to_string ~pretty:false r3 in 170 + if s2 <> s3 then begin 171 + Printf.printf "ROUNDTRIP UNSTABLE [%s]:\n" category; 172 + Printf.printf " Input: %s\n" (String.escaped input); 173 + Printf.printf " S2: %s\n" (String.escaped (String.sub s2 0 (min 100 (String.length s2)))); 174 + Printf.printf " S3: %s\n" (String.escaped (String.sub s3 0 (min 100 (String.length s3)))); 175 + false 176 + end else 177 + true 178 + with e -> 179 + Printf.printf "EXCEPTION [%s]: %s\n" category (Printexc.to_string e); 180 + Printf.printf " Input: %s\n" (String.escaped input); 181 + false 182 + 183 + let run_category name cases = 184 + let passed = ref 0 in 185 + let failed = ref 0 in 186 + Array.iter (fun input -> 187 + if test_error_recovery input name then 188 + incr passed 189 + else 190 + incr failed 191 + ) cases; 192 + Printf.printf "%s: %d/%d passed\n" name !passed (Array.length cases); 193 + !failed = 0 194 + 195 + let () = 196 + let all_pass = ref true in 197 + 198 + let categories = [ 199 + ("unclosed_tags", unclosed_tag_cases); 200 + ("misnested", misnested_cases); 201 + ("invalid_tags", invalid_tag_cases); 202 + ("invalid_attrs", invalid_attr_cases); 203 + ("premature_eof", premature_eof_cases); 204 + ("semantic_nesting", semantic_nesting_cases); 205 + ("foreign_content", foreign_content_cases); 206 + ("doctype", doctype_cases); 207 + ("comments", comment_cases); 208 + ("entities", entity_cases); 209 + ("rawtext", rawtext_cases); 210 + ("tables", table_cases); 211 + ] in 212 + 213 + List.iter (fun (name, cases) -> 214 + if not (run_category name cases) then 215 + all_pass := false 216 + ) categories; 217 + 218 + Printf.printf "\n=== Summary ===\n"; 219 + if !all_pass then 220 + Printf.printf "All error recovery tests passed!\n" 221 + else begin 222 + Printf.printf "Some error recovery tests failed!\n"; 223 + exit 1 224 + end
+256
fuzz/fuzz_exhaustion.ml
···
··· 1 + (* Resource exhaustion tests for HTML5rw 2 + Tests for algorithmic complexity bugs, memory issues, and DoS vectors *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Timing helper *) 7 + let time_it f = 8 + let start = Unix.gettimeofday () in 9 + let result = f () in 10 + let elapsed = Unix.gettimeofday () -. start in 11 + (result, elapsed) 12 + 13 + (* Test 1: Deeply nested elements *) 14 + (* Note: Deep nesting can exhibit O(nยฒ) complexity in tree construction. 15 + The timing thresholds are set to catch severe regressions while allowing 16 + for some expected slowdown with very deep nesting. *) 17 + let test_deep_nesting depth = 18 + let input = String.concat "" (List.init depth (fun _ -> "<div>")) in 19 + let (_, elapsed) = time_it (fun () -> 20 + try 21 + let r = Html5rw.parse (reader_of_string input) in 22 + let _ = Html5rw.to_string r in 23 + true 24 + with _ -> false 25 + ) in 26 + (* Allow quadratic behavior up to a reasonable limit for very deep nesting. 27 + HTML5 spec allows implementations to impose nesting limits for DoS protection. *) 28 + let max_time = 29 + if depth <= 1000 then float depth *. 0.001 +. 1.0 30 + else float depth *. 0.02 +. 30.0 (* Very lenient for extreme depth - known O(nยฒ) case *) 31 + in 32 + if elapsed > max_time then begin 33 + Printf.printf "SLOW: deep_nesting(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time; 34 + false 35 + end else 36 + true 37 + 38 + (* Test 2: Wide trees (many siblings) *) 39 + let test_wide_tree width = 40 + let children = String.concat "" (List.init width (fun i -> Printf.sprintf "<span>%d</span>" i)) in 41 + let input = "<div>" ^ children ^ "</div>" in 42 + let (_, elapsed) = time_it (fun () -> 43 + try 44 + let r = Html5rw.parse (reader_of_string input) in 45 + let _ = Html5rw.to_string r in 46 + true 47 + with _ -> false 48 + ) in 49 + let max_time = float width *. 0.0001 +. 0.5 in 50 + if elapsed > max_time then begin 51 + Printf.printf "SLOW: wide_tree(%d) took %.3fs (max %.3fs)\n" width elapsed max_time; 52 + false 53 + end else 54 + true 55 + 56 + (* Test 3: Huge text nodes *) 57 + let test_huge_text size = 58 + let text = String.make size 'x' in 59 + let input = "<div>" ^ text ^ "</div>" in 60 + let (_, elapsed) = time_it (fun () -> 61 + try 62 + let r = Html5rw.parse (reader_of_string input) in 63 + let _ = Html5rw.to_string r in 64 + true 65 + with _ -> false 66 + ) in 67 + let max_time = float size *. 0.00001 +. 0.5 in 68 + if elapsed > max_time then begin 69 + Printf.printf "SLOW: huge_text(%d) took %.3fs (max %.3fs)\n" size elapsed max_time; 70 + false 71 + end else 72 + true 73 + 74 + (* Test 4: Many attributes *) 75 + let test_many_attrs count = 76 + let attrs = String.concat " " (List.init count (fun i -> Printf.sprintf "a%d=\"v%d\"" i i)) in 77 + let input = Printf.sprintf "<div %s></div>" attrs in 78 + let (_, elapsed) = time_it (fun () -> 79 + try 80 + let r = Html5rw.parse (reader_of_string input) in 81 + let _ = Html5rw.to_string r in 82 + true 83 + with _ -> false 84 + ) in 85 + let max_time = float count *. 0.0001 +. 0.5 in 86 + if elapsed > max_time then begin 87 + Printf.printf "SLOW: many_attrs(%d) took %.3fs (max %.3fs)\n" count elapsed max_time; 88 + false 89 + end else 90 + true 91 + 92 + (* Test 5: Huge attribute values *) 93 + let test_huge_attr_value size = 94 + let value = String.make size 'x' in 95 + let input = Printf.sprintf "<div data-x=\"%s\"></div>" value in 96 + let (_, elapsed) = time_it (fun () -> 97 + try 98 + let r = Html5rw.parse (reader_of_string input) in 99 + let _ = Html5rw.to_string r in 100 + true 101 + with _ -> false 102 + ) in 103 + let max_time = float size *. 0.00001 +. 0.5 in 104 + if elapsed > max_time then begin 105 + Printf.printf "SLOW: huge_attr_value(%d) took %.3fs (max %.3fs)\n" size elapsed max_time; 106 + false 107 + end else 108 + true 109 + 110 + (* Test 6: Repeated unclosed p tags (adoption agency stress test) *) 111 + let test_repeated_p count = 112 + let input = String.concat "" (List.init count (fun _ -> "<p>")) in 113 + let (_, elapsed) = time_it (fun () -> 114 + try 115 + let r = Html5rw.parse (reader_of_string input) in 116 + let _ = Html5rw.to_string r in 117 + true 118 + with _ -> false 119 + ) in 120 + (* This could trigger O(n^2) behavior in naive implementations *) 121 + let max_time = float count *. 0.001 +. 1.0 in 122 + if elapsed > max_time then begin 123 + Printf.printf "SLOW: repeated_p(%d) took %.3fs (max %.3fs)\n" count elapsed max_time; 124 + false 125 + end else 126 + true 127 + 128 + (* Test 7: Nested formatting elements (adoption agency stress) *) 129 + let test_nested_formatting depth = 130 + let tags = [| "a"; "b"; "i"; "em"; "strong" |] in 131 + let open_tags = String.concat "" (List.init depth (fun i -> "<" ^ tags.(i mod 5) ^ ">")) in 132 + let input = open_tags ^ "text" in 133 + let (_, elapsed) = time_it (fun () -> 134 + try 135 + let r = Html5rw.parse (reader_of_string input) in 136 + let _ = Html5rw.to_string r in 137 + true 138 + with _ -> false 139 + ) in 140 + let max_time = float depth *. 0.001 +. 0.5 in 141 + if elapsed > max_time then begin 142 + Printf.printf "SLOW: nested_formatting(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time; 143 + false 144 + end else 145 + true 146 + 147 + (* Test 8: Table with many cells *) 148 + let test_large_table rows cols = 149 + let cells = String.concat "" (List.init cols (fun _ -> "<td>x</td>")) in 150 + let row = "<tr>" ^ cells ^ "</tr>" in 151 + let tbody = String.concat "" (List.init rows (fun _ -> row)) in 152 + let input = "<table><tbody>" ^ tbody ^ "</tbody></table>" in 153 + let (_, elapsed) = time_it (fun () -> 154 + try 155 + let r = Html5rw.parse (reader_of_string input) in 156 + let _ = Html5rw.to_string r in 157 + true 158 + with _ -> false 159 + ) in 160 + let total = rows * cols in 161 + let max_time = float total *. 0.0001 +. 1.0 in 162 + if elapsed > max_time then begin 163 + Printf.printf "SLOW: large_table(%dx%d) took %.3fs (max %.3fs)\n" rows cols elapsed max_time; 164 + false 165 + end else 166 + true 167 + 168 + (* Test 9: Deeply nested tables *) 169 + let test_nested_tables depth = 170 + let rec make_table d = 171 + if d = 0 then "x" 172 + else "<table><tr><td>" ^ make_table (d - 1) ^ "</td></tr></table>" 173 + in 174 + let input = make_table depth in 175 + let (_, elapsed) = time_it (fun () -> 176 + try 177 + let r = Html5rw.parse (reader_of_string input) in 178 + let _ = Html5rw.to_string r in 179 + true 180 + with _ -> false 181 + ) in 182 + let max_time = float depth *. 0.01 +. 0.5 in 183 + if elapsed > max_time then begin 184 + Printf.printf "SLOW: nested_tables(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time; 185 + false 186 + end else 187 + true 188 + 189 + (* Test 10: Many entity references *) 190 + let test_many_entities count = 191 + let entities = String.concat "" (List.init count (fun _ -> "&amp;")) in 192 + let input = "<div>" ^ entities ^ "</div>" in 193 + let (_, elapsed) = time_it (fun () -> 194 + try 195 + let r = Html5rw.parse (reader_of_string input) in 196 + let _ = Html5rw.to_string r in 197 + true 198 + with _ -> false 199 + ) in 200 + let max_time = float count *. 0.0001 +. 0.5 in 201 + if elapsed > max_time then begin 202 + Printf.printf "SLOW: many_entities(%d) took %.3fs (max %.3fs)\n" count elapsed max_time; 203 + false 204 + end else 205 + true 206 + 207 + (* Run all exhaustion tests *) 208 + let run_all_tests () = 209 + let tests = [ 210 + ("deep_nesting_100", fun () -> test_deep_nesting 100); 211 + ("deep_nesting_1000", fun () -> test_deep_nesting 1000); 212 + ("deep_nesting_5000", fun () -> test_deep_nesting 5000); 213 + ("wide_tree_100", fun () -> test_wide_tree 100); 214 + ("wide_tree_1000", fun () -> test_wide_tree 1000); 215 + ("wide_tree_10000", fun () -> test_wide_tree 10000); 216 + ("huge_text_10000", fun () -> test_huge_text 10000); 217 + ("huge_text_100000", fun () -> test_huge_text 100000); 218 + ("many_attrs_100", fun () -> test_many_attrs 100); 219 + ("many_attrs_1000", fun () -> test_many_attrs 1000); 220 + ("huge_attr_10000", fun () -> test_huge_attr_value 10000); 221 + ("huge_attr_100000", fun () -> test_huge_attr_value 100000); 222 + ("repeated_p_100", fun () -> test_repeated_p 100); 223 + ("repeated_p_500", fun () -> test_repeated_p 500); 224 + ("nested_formatting_50", fun () -> test_nested_formatting 50); 225 + ("nested_formatting_200", fun () -> test_nested_formatting 200); 226 + ("large_table_10x10", fun () -> test_large_table 10 10); 227 + ("large_table_100x100", fun () -> test_large_table 100 100); 228 + ("nested_tables_10", fun () -> test_nested_tables 10); 229 + ("nested_tables_50", fun () -> test_nested_tables 50); 230 + ("many_entities_1000", fun () -> test_many_entities 1000); 231 + ("many_entities_10000", fun () -> test_many_entities 10000); 232 + ] in 233 + 234 + let passed = ref 0 in 235 + let failed = ref 0 in 236 + 237 + List.iter (fun (name, test) -> 238 + Printf.printf "Running %s... %!" name; 239 + if test () then begin 240 + Printf.printf "PASS\n%!"; 241 + incr passed 242 + end else begin 243 + Printf.printf "FAIL\n%!"; 244 + incr failed 245 + end 246 + ) tests; 247 + 248 + Printf.printf "\n=== Summary ===\n"; 249 + Printf.printf "Passed: %d\n" !passed; 250 + Printf.printf "Failed: %d\n" !failed; 251 + 252 + !failed = 0 253 + 254 + let () = 255 + if not (run_all_tests ()) then 256 + exit 1
+261
fuzz/fuzz_fragment.ml
···
··· 1 + (* Fragment parsing fuzzer for HTML5rw 2 + Tests innerHTML-style fragment parsing with various context elements *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* All context element types to test *) 7 + let html_contexts = [| 8 + "div"; "span"; "p"; "a"; "b"; "i"; "em"; "strong"; 9 + "ul"; "ol"; "li"; "dl"; "dt"; "dd"; 10 + "table"; "thead"; "tbody"; "tfoot"; "tr"; "th"; "td"; "caption"; 11 + "select"; "optgroup"; "option"; 12 + "form"; "fieldset"; "legend"; "label"; "input"; "button"; "textarea"; 13 + "pre"; "code"; "blockquote"; 14 + "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; 15 + "article"; "section"; "nav"; "aside"; "header"; "footer"; "main"; 16 + "figure"; "figcaption"; 17 + "template"; 18 + |] 19 + 20 + let svg_contexts = [| 21 + "svg"; "g"; "circle"; "rect"; "path"; "text"; "tspan"; 22 + "foreignObject"; "title"; "desc"; 23 + |] 24 + 25 + let math_contexts = [| 26 + "math"; "mi"; "mo"; "mn"; "ms"; "mrow"; 27 + "annotation-xml"; 28 + |] 29 + 30 + (* Test fragments for different contexts *) 31 + let general_fragments = [| 32 + "text content"; 33 + "<span>inline</span>"; 34 + "<div>block</div>"; 35 + "<a href=\"#\">link</a>"; 36 + "<b><i>nested</i></b>"; 37 + "text<br>text"; 38 + "<!-- comment -->"; 39 + "<img src=\"x\">"; 40 + |] 41 + 42 + let list_fragments = [| 43 + "<li>item</li>"; 44 + "<li>one</li><li>two</li>"; 45 + "text in list"; 46 + "<li><ul><li>nested</li></ul></li>"; 47 + |] 48 + 49 + let table_fragments = [| 50 + "<tr><td>cell</td></tr>"; 51 + "<td>cell</td>"; 52 + "<th>header</th>"; 53 + "text in table"; 54 + "<tbody><tr><td>x</td></tr></tbody>"; 55 + "<caption>title</caption>"; 56 + |] 57 + 58 + let select_fragments = [| 59 + "<option>opt</option>"; 60 + "<option value=\"1\">one</option><option value=\"2\">two</option>"; 61 + "<optgroup label=\"group\"><option>x</option></optgroup>"; 62 + "text in select"; 63 + |] 64 + 65 + let svg_fragments = [| 66 + "<circle cx=\"50\" cy=\"50\" r=\"40\"/>"; 67 + "<rect x=\"0\" y=\"0\" width=\"100\" height=\"100\"/>"; 68 + "<text>SVG text</text>"; 69 + "<g><circle r=\"10\"/></g>"; 70 + "<foreignObject><div>HTML in SVG</div></foreignObject>"; 71 + |] 72 + 73 + let math_fragments = [| 74 + "<mi>x</mi>"; 75 + "<mo>=</mo>"; 76 + "<mn>42</mn>"; 77 + "<mrow><mi>x</mi><mo>=</mo><mn>1</mn></mrow>"; 78 + |] 79 + 80 + (* Test parsing a fragment with a given context *) 81 + let test_fragment_parse ctx_tag ?namespace fragment = 82 + try 83 + let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag ?namespace () in 84 + let doc = Html5rw.parse ~fragment_context:ctx (reader_of_string fragment) in 85 + let _ = Html5rw.to_string ~pretty:false doc in 86 + true 87 + with e -> 88 + Printf.printf "Exception: %s\n" (Printexc.to_string e); 89 + Printf.printf " Context: <%s>\n" ctx_tag; 90 + Printf.printf " Fragment: %s\n" (String.escaped fragment); 91 + false 92 + 93 + (* Test roundtrip stability for fragment parsing *) 94 + let test_fragment_roundtrip ctx_tag ?namespace fragment = 95 + try 96 + let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag ?namespace () in 97 + let doc1 = Html5rw.parse ~fragment_context:ctx (reader_of_string fragment) in 98 + let s1 = Html5rw.to_string ~pretty:false doc1 in 99 + let doc2 = Html5rw.parse ~fragment_context:ctx (reader_of_string s1) in 100 + let s2 = Html5rw.to_string ~pretty:false doc2 in 101 + let doc3 = Html5rw.parse ~fragment_context:ctx (reader_of_string s2) in 102 + let s3 = Html5rw.to_string ~pretty:false doc3 in 103 + if s2 <> s3 then begin 104 + Printf.printf "Roundtrip mismatch:\n"; 105 + Printf.printf " Context: <%s>\n" ctx_tag; 106 + Printf.printf " S2: %s\n" (String.escaped s2); 107 + Printf.printf " S3: %s\n" (String.escaped s3); 108 + false 109 + end else 110 + true 111 + with _ -> false 112 + 113 + (* Compare fragment parsing with different contexts *) 114 + let test_context_sensitivity () = 115 + let test_cases = [| 116 + (* These should produce different results in different contexts *) 117 + ("<li>item</li>", [| "ul"; "ol"; "div"; "body" |]); 118 + ("<tr><td>x</td></tr>", [| "table"; "tbody"; "div"; "body" |]); 119 + ("<td>x</td>", [| "tr"; "table"; "div"; "body" |]); 120 + ("<option>x</option>", [| "select"; "optgroup"; "div"; "body" |]); 121 + ("<p>text</p>", [| "div"; "p"; "body" |]); 122 + |] in 123 + 124 + let all_ok = ref true in 125 + Array.iter (fun (fragment, contexts) -> 126 + Array.iter (fun ctx -> 127 + if not (test_fragment_parse ctx fragment) then begin 128 + Printf.printf "FAIL: <%s> with fragment: %s\n" ctx fragment; 129 + all_ok := false 130 + end 131 + ) contexts 132 + ) test_cases; 133 + !all_ok 134 + 135 + (* Run comprehensive tests *) 136 + let run_all_tests () = 137 + let all_pass = ref true in 138 + 139 + Printf.printf "=== Fragment Parsing Tests ===\n\n"; 140 + 141 + (* Test HTML contexts with general fragments *) 142 + Printf.printf "--- HTML contexts with general fragments ---\n"; 143 + let html_pass = ref 0 in 144 + let html_fail = ref 0 in 145 + Array.iter (fun ctx -> 146 + Array.iter (fun fragment -> 147 + if test_fragment_parse ctx fragment then 148 + incr html_pass 149 + else 150 + incr html_fail 151 + ) general_fragments 152 + ) html_contexts; 153 + Printf.printf "HTML contexts: %d/%d\n" !html_pass (!html_pass + !html_fail); 154 + if !html_fail > 0 then all_pass := false; 155 + 156 + (* Test list contexts *) 157 + Printf.printf "\n--- List contexts ---\n"; 158 + let list_pass = ref 0 in 159 + let list_fail = ref 0 in 160 + Array.iter (fun ctx -> 161 + Array.iter (fun fragment -> 162 + if test_fragment_parse ctx fragment then 163 + incr list_pass 164 + else 165 + incr list_fail 166 + ) list_fragments 167 + ) [| "ul"; "ol"; "menu" |]; 168 + Printf.printf "List contexts: %d/%d\n" !list_pass (!list_pass + !list_fail); 169 + if !list_fail > 0 then all_pass := false; 170 + 171 + (* Test table contexts *) 172 + Printf.printf "\n--- Table contexts ---\n"; 173 + let table_pass = ref 0 in 174 + let table_fail = ref 0 in 175 + Array.iter (fun ctx -> 176 + Array.iter (fun fragment -> 177 + if test_fragment_parse ctx fragment then 178 + incr table_pass 179 + else 180 + incr table_fail 181 + ) table_fragments 182 + ) [| "table"; "tbody"; "thead"; "tfoot"; "tr" |]; 183 + Printf.printf "Table contexts: %d/%d\n" !table_pass (!table_pass + !table_fail); 184 + if !table_fail > 0 then all_pass := false; 185 + 186 + (* Test select contexts *) 187 + Printf.printf "\n--- Select contexts ---\n"; 188 + let select_pass = ref 0 in 189 + let select_fail = ref 0 in 190 + Array.iter (fun ctx -> 191 + Array.iter (fun fragment -> 192 + if test_fragment_parse ctx fragment then 193 + incr select_pass 194 + else 195 + incr select_fail 196 + ) select_fragments 197 + ) [| "select"; "optgroup" |]; 198 + Printf.printf "Select contexts: %d/%d\n" !select_pass (!select_pass + !select_fail); 199 + if !select_fail > 0 then all_pass := false; 200 + 201 + (* Test SVG contexts *) 202 + Printf.printf "\n--- SVG contexts ---\n"; 203 + let svg_pass = ref 0 in 204 + let svg_fail = ref 0 in 205 + Array.iter (fun ctx -> 206 + Array.iter (fun fragment -> 207 + if test_fragment_parse ctx ~namespace:(Some "svg") fragment then 208 + incr svg_pass 209 + else 210 + incr svg_fail 211 + ) svg_fragments 212 + ) svg_contexts; 213 + Printf.printf "SVG contexts: %d/%d\n" !svg_pass (!svg_pass + !svg_fail); 214 + if !svg_fail > 0 then all_pass := false; 215 + 216 + (* Test Math contexts *) 217 + Printf.printf "\n--- Math contexts ---\n"; 218 + let math_pass = ref 0 in 219 + let math_fail = ref 0 in 220 + Array.iter (fun ctx -> 221 + Array.iter (fun fragment -> 222 + if test_fragment_parse ctx ~namespace:(Some "math") fragment then 223 + incr math_pass 224 + else 225 + incr math_fail 226 + ) math_fragments 227 + ) math_contexts; 228 + Printf.printf "Math contexts: %d/%d\n" !math_pass (!math_pass + !math_fail); 229 + if !math_fail > 0 then all_pass := false; 230 + 231 + (* Test context sensitivity *) 232 + Printf.printf "\n--- Context sensitivity ---\n"; 233 + if not (test_context_sensitivity ()) then 234 + all_pass := false 235 + else 236 + Printf.printf "Context sensitivity: OK\n"; 237 + 238 + (* Test roundtrip for a sample *) 239 + Printf.printf "\n--- Roundtrip stability ---\n"; 240 + let rt_pass = ref 0 in 241 + let rt_fail = ref 0 in 242 + Array.iter (fun ctx -> 243 + Array.iter (fun fragment -> 244 + if test_fragment_roundtrip ctx fragment then 245 + incr rt_pass 246 + else 247 + incr rt_fail 248 + ) general_fragments 249 + ) [| "div"; "span"; "ul"; "table"; "select" |]; 250 + Printf.printf "Roundtrip: %d/%d\n" !rt_pass (!rt_pass + !rt_fail); 251 + if !rt_fail > 0 then all_pass := false; 252 + 253 + Printf.printf "\n=== Summary ===\n"; 254 + if !all_pass then 255 + Printf.printf "All fragment parsing tests passed!\n" 256 + else begin 257 + Printf.printf "Some fragment parsing tests failed!\n"; 258 + exit 1 259 + end 260 + 261 + let () = run_all_tests ()
+537
fuzz/fuzz_html5rw.ml
···
··· 1 + (** Comprehensive fuzz tests for html5rw HTML5 parser using Crowbar *) 2 + 3 + open Crowbar 4 + 5 + (* Helper to create a bytes reader from a string *) 6 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 7 + 8 + (* ========================================================================== 9 + Generators for HTML-like content 10 + ========================================================================== *) 11 + 12 + (* Common tag names for structured generation *) 13 + let tag_names = [ 14 + "div"; "p"; "span"; "a"; "h1"; "h2"; "h3"; "ul"; "li"; "ol"; 15 + "table"; "tr"; "td"; "th"; "thead"; "tbody"; "form"; "input"; 16 + "button"; "select"; "option"; "textarea"; "label"; "img"; "br"; 17 + "hr"; "b"; "i"; "strong"; "em"; "code"; "pre"; "script"; "style"; 18 + "head"; "body"; "html"; "title"; "meta"; "link"; "nav"; "header"; 19 + "footer"; "main"; "section"; "article"; "aside"; "figure"; "svg"; 20 + "math"; "template"; "iframe"; "noscript"; "plaintext"; "xmp"; 21 + ] 22 + 23 + let tag_name_gen = choose (List.map const tag_names) 24 + 25 + (* Common attribute names *) 26 + let attr_names = [ 27 + "id"; "class"; "href"; "src"; "style"; "title"; "alt"; "name"; 28 + "type"; "value"; "data-foo"; "aria-label"; "onclick"; "onload"; 29 + ] 30 + 31 + let attr_name_gen = choose (List.map const attr_names) 32 + 33 + (* Generator for a simple attribute *) 34 + let attr_gen = 35 + map [attr_name_gen; bytes] (fun name value -> 36 + Printf.sprintf "%s=\"%s\"" name (String.escaped value)) 37 + 38 + (* Generator for attributes list *) 39 + let attrs_gen = list attr_gen 40 + 41 + (* Generator for a simple opening tag *) 42 + let start_tag_gen = 43 + map [tag_name_gen; attrs_gen] (fun tag attrs -> 44 + let attrs_str = String.concat " " attrs in 45 + if attrs_str = "" then Printf.sprintf "<%s>" tag 46 + else Printf.sprintf "<%s %s>" tag attrs_str) 47 + 48 + (* Generator for a simple closing tag *) 49 + let end_tag_gen = 50 + map [tag_name_gen] (fun tag -> Printf.sprintf "</%s>" tag) 51 + 52 + (* Generator for text content *) 53 + let text_content_gen = 54 + choose [ 55 + const ""; 56 + const "Hello"; 57 + const "Hello, world!"; 58 + const "Test <with> special &chars;"; 59 + bytes; 60 + ] 61 + 62 + (* Generator for comments - used in html_gen via malformed_html_gen *) 63 + let _comment_gen = 64 + map [bytes] (fun content -> 65 + Printf.sprintf "<!--%s-->" content) 66 + 67 + (* Generator for DOCTYPE *) 68 + let doctype_gen = 69 + choose [ 70 + const "<!DOCTYPE html>"; 71 + const "<!doctype html>"; 72 + const "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">"; 73 + const "<!DOCTYPE>"; 74 + map [bytes] (fun s -> Printf.sprintf "<!DOCTYPE %s>" s); 75 + ] 76 + 77 + (* Generator for simple HTML fragments *) 78 + let simple_html_gen = 79 + map [start_tag_gen; text_content_gen; end_tag_gen] 80 + (fun start text _end_ -> start ^ text ^ _end_) 81 + 82 + (* Generator for nested HTML *) 83 + let nested_html_gen = 84 + map [start_tag_gen; simple_html_gen; end_tag_gen] 85 + (fun outer inner _end_ -> outer ^ inner ^ _end_) 86 + 87 + (* Generator for structured HTML with common patterns *) 88 + let structured_html_gen = 89 + choose [ 90 + const "<html><head><title>Test</title></head><body></body></html>"; 91 + const "<!DOCTYPE html><html><body><p>Hello</p></body></html>"; 92 + const "<div><span>text</span></div>"; 93 + const "<table><tr><td>cell</td></tr></table>"; 94 + const "<ul><li>item1</li><li>item2</li></ul>"; 95 + const "<form><input type=\"text\"><button>Submit</button></form>"; 96 + const "<p>First</p><p>Second</p>"; 97 + const "<div><div><div>nested</div></div></div>"; 98 + simple_html_gen; 99 + nested_html_gen; 100 + ] 101 + 102 + (* Generator for malformed/edge case HTML *) 103 + let malformed_html_gen = 104 + choose [ 105 + const "<"; 106 + const ">"; 107 + const "</"; 108 + const "<>"; 109 + const "<<>>"; 110 + const "<div"; 111 + const "<div>"; 112 + const "</div>"; 113 + const "<div><span>"; 114 + const "<div></span></div>"; 115 + const "<p><div></div></p>"; 116 + const "<!-"; 117 + const "<!--"; 118 + const "<!-->"; 119 + const "<!--->"; 120 + const "<!-- -- -->"; 121 + const "&"; 122 + const "&amp"; 123 + const "&amp;"; 124 + const "&#"; 125 + const "&#60"; 126 + const "&#60;"; 127 + const "&#x"; 128 + const "&#x3c;"; 129 + const "<script>alert('xss')</script>"; 130 + const "<style>body{}</style>"; 131 + const "<![CDATA[test]]>"; 132 + const "<?xml version=\"1.0\"?>"; 133 + const "<svg><foreignObject></foreignObject></svg>"; 134 + const "<math><mi>x</mi></math>"; 135 + const "<template><div>content</div></template>"; 136 + const "<table><div>misplaced</div><tr><td>ok</td></tr></table>"; 137 + map [bytes] (fun s -> "<" ^ s ^ ">"); 138 + map [bytes; bytes] (fun a b -> "<" ^ a ^ " " ^ b ^ ">"); 139 + ] 140 + 141 + (* Combined HTML generator *) 142 + let html_gen = 143 + choose [ 144 + bytes; (* Completely random *) 145 + structured_html_gen; (* Well-structured HTML *) 146 + malformed_html_gen; (* Known edge cases *) 147 + map [doctype_gen; structured_html_gen] (fun dt html -> dt ^ html); 148 + ] 149 + 150 + (* CSS selector generators *) 151 + let selector_gen = 152 + choose [ 153 + const "*"; 154 + const "div"; 155 + const "#id"; 156 + const ".class"; 157 + const "div.class"; 158 + const "div#id"; 159 + const "[attr]"; 160 + const "[attr=value]"; 161 + const "[attr~=value]"; 162 + const "[attr|=value]"; 163 + const "[attr^=value]"; 164 + const "[attr$=value]"; 165 + const "[attr*=value]"; 166 + const ":first-child"; 167 + const ":last-child"; 168 + const ":nth-child(1)"; 169 + const ":nth-child(2n+1)"; 170 + const ":only-child"; 171 + const ":empty"; 172 + const ":not(div)"; 173 + const "div > p"; 174 + const "div p"; 175 + const "div + p"; 176 + const "div ~ p"; 177 + const "div, p"; 178 + const "div > p.class#id[attr]:first-child"; 179 + bytes; (* Random selector to find crashes *) 180 + ] 181 + 182 + (* Fragment context tag names 183 + Note: raw text elements (script, style, textarea, title, xmp, iframe, etc.) 184 + are excluded because fragment content parsed in their context cannot 185 + round-trip correctly - the content is raw text but serialized without 186 + the element wrapper, so escaping behavior differs. *) 187 + let fragment_context_gen = 188 + choose [ 189 + const "div"; 190 + const "body"; 191 + const "html"; 192 + const "table"; 193 + const "tr"; 194 + const "tbody"; 195 + const "thead"; 196 + const "td"; 197 + const "th"; 198 + const "ul"; 199 + const "ol"; 200 + const "select"; 201 + const "template"; 202 + const "svg"; 203 + const "math"; 204 + (* Exclude raw text contexts: script, style, textarea, title *) 205 + ] 206 + 207 + (* ========================================================================== 208 + Test 1: Crash resistance - arbitrary input should not crash 209 + ========================================================================== *) 210 + 211 + let () = 212 + add_test ~name:"html5rw_no_crash_bytes" [bytes] @@ fun input -> 213 + let _ = 214 + try Html5rw.parse (reader_of_string input) 215 + with _ -> Html5rw.parse (reader_of_string "") 216 + in 217 + check true 218 + 219 + let () = 220 + add_test ~name:"html5rw_no_crash_html" [html_gen] @@ fun input -> 221 + let _ = 222 + try Html5rw.parse (reader_of_string input) 223 + with _ -> Html5rw.parse (reader_of_string "") 224 + in 225 + check true 226 + 227 + let () = 228 + add_test ~name:"html5rw_parse_bytes_no_crash" [bytes] @@ fun input -> 229 + let _ = 230 + try Html5rw.parse_bytes (Bytes.of_string input) 231 + with _ -> Html5rw.parse_bytes (Bytes.of_string "") 232 + in 233 + check true 234 + 235 + (* ========================================================================== 236 + Test 2: Roundtrip - parse -> serialize -> reparse should be consistent 237 + ========================================================================== *) 238 + 239 + (* Serialize a parse result to string *) 240 + let serialize result = 241 + Html5rw.to_string ~pretty:false result 242 + 243 + (* Compare two DOM trees structurally (text content of serialized output) *) 244 + let _trees_equivalent result1 result2 = 245 + let s1 = serialize result1 in 246 + let s2 = serialize result2 in 247 + s1 = s2 248 + 249 + let () = 250 + add_test ~name:"html5rw_roundtrip_idempotent" [html_gen] @@ fun input -> 251 + try 252 + (* Parse original *) 253 + let result1 = Html5rw.parse (reader_of_string input) in 254 + let serialized1 = serialize result1 in 255 + 256 + (* Reparse serialized output *) 257 + let result2 = Html5rw.parse (reader_of_string serialized1) in 258 + let serialized2 = serialize result2 in 259 + 260 + (* The second serialization should equal the first *) 261 + (* (First parse may normalize, but second should be stable) *) 262 + if serialized1 <> serialized2 then begin 263 + Printf.printf "\nRoundtrip mismatch:\n"; 264 + Printf.printf "Input: %s\n" (String.escaped (String.sub input 0 (min 200 (String.length input)))); 265 + Printf.printf "First: %s\n" (String.escaped (String.sub serialized1 0 (min 200 (String.length serialized1)))); 266 + Printf.printf "Second: %s\n" (String.escaped (String.sub serialized2 0 (min 200 (String.length serialized2)))); 267 + check false 268 + end else 269 + check true 270 + with e -> 271 + Printf.printf "\nRoundtrip exception: %s\n" (Printexc.to_string e); 272 + check false 273 + 274 + (* Additional roundtrip test: parse -> serialize -> reparse -> serialize should stabilize *) 275 + let () = 276 + add_test ~name:"html5rw_triple_roundtrip" [structured_html_gen] @@ fun input -> 277 + try 278 + let r1 = Html5rw.parse (reader_of_string input) in 279 + let s1 = serialize r1 in 280 + 281 + let r2 = Html5rw.parse (reader_of_string s1) in 282 + let s2 = serialize r2 in 283 + 284 + let r3 = Html5rw.parse (reader_of_string s2) in 285 + let s3 = serialize r3 in 286 + 287 + (* By the third roundtrip, output should be stable *) 288 + if s2 <> s3 then begin 289 + Printf.printf "\nTriple roundtrip not stable:\n"; 290 + Printf.printf "s2: %s\n" (String.escaped (String.sub s2 0 (min 200 (String.length s2)))); 291 + Printf.printf "s3: %s\n" (String.escaped (String.sub s3 0 (min 200 (String.length s3)))); 292 + check false 293 + end else 294 + check true 295 + with e -> 296 + Printf.printf "\nTriple roundtrip exception: %s\n" (Printexc.to_string e); 297 + check false 298 + 299 + (* ========================================================================== 300 + Test 3: Serialization idempotence 301 + ========================================================================== *) 302 + 303 + let () = 304 + add_test ~name:"html5rw_serialize_idempotent" [html_gen] @@ fun input -> 305 + try 306 + let result = Html5rw.parse (reader_of_string input) in 307 + let s1 = serialize result in 308 + let s2 = serialize result in 309 + if s1 <> s2 then begin 310 + Printf.printf "\nSerialization not idempotent!\n"; 311 + check false 312 + end else 313 + check true 314 + with e -> 315 + Printf.printf "\nSerialization exception: %s\n" (Printexc.to_string e); 316 + check false 317 + 318 + (* ========================================================================== 319 + Test 4: CSS Selector crash resistance 320 + ========================================================================== *) 321 + 322 + let () = 323 + add_test ~name:"html5rw_selector_no_crash" [selector_gen; html_gen] @@ fun selector html -> 324 + try 325 + let result = Html5rw.parse (reader_of_string html) in 326 + let _ = Html5rw.query result selector in 327 + check true 328 + with 329 + | Html5rw.Selector.Selector_error _ -> check true (* Expected for malformed selectors *) 330 + | e -> 331 + Printf.printf "\nUnexpected selector exception: %s\n" (Printexc.to_string e); 332 + Printf.printf "Selector: %s\n" (String.escaped selector); 333 + check false 334 + 335 + let () = 336 + add_test ~name:"html5rw_matches_no_crash" [selector_gen; html_gen] @@ fun selector html -> 337 + try 338 + let result = Html5rw.parse (reader_of_string html) in 339 + let root = Html5rw.root result in 340 + let _ = Html5rw.matches root selector in 341 + check true 342 + with 343 + | Html5rw.Selector.Selector_error _ -> check true 344 + | e -> 345 + Printf.printf "\nUnexpected matches exception: %s\n" (Printexc.to_string e); 346 + check false 347 + 348 + (* ========================================================================== 349 + Test 5: Fragment parsing 350 + ========================================================================== *) 351 + 352 + let () = 353 + add_test ~name:"html5rw_fragment_no_crash" [fragment_context_gen; html_gen] 354 + @@ fun ctx_tag html -> 355 + try 356 + let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag () in 357 + let _ = Html5rw.parse ~fragment_context:ctx (reader_of_string html) in 358 + check true 359 + with e -> 360 + Printf.printf "\nFragment parse exception with context '%s': %s\n" 361 + ctx_tag (Printexc.to_string e); 362 + check false 363 + 364 + (* Fragment roundtrip *) 365 + let () = 366 + add_test ~name:"html5rw_fragment_roundtrip" [fragment_context_gen; structured_html_gen] 367 + @@ fun ctx_tag html -> 368 + try 369 + let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag () in 370 + let r1 = Html5rw.parse ~fragment_context:ctx (reader_of_string html) in 371 + let s1 = serialize r1 in 372 + 373 + let r2 = Html5rw.parse ~fragment_context:ctx (reader_of_string s1) in 374 + let s2 = serialize r2 in 375 + 376 + let r3 = Html5rw.parse ~fragment_context:ctx (reader_of_string s2) in 377 + let s3 = serialize r3 in 378 + 379 + if s2 <> s3 then begin 380 + Printf.printf "\nFragment roundtrip not stable with context '%s'\n" ctx_tag; 381 + Printf.printf "Input: %s\n" (String.escaped (String.sub html 0 (min 100 (String.length html)))); 382 + Printf.printf "s1: %s\n" (String.escaped (String.sub s1 0 (min 100 (String.length s1)))); 383 + Printf.printf "s2: %s\n" (String.escaped (String.sub s2 0 (min 100 (String.length s2)))); 384 + Printf.printf "s3: %s\n" (String.escaped (String.sub s3 0 (min 100 (String.length s3)))); 385 + check false 386 + end else 387 + check true 388 + with e -> 389 + Printf.printf "\nFragment roundtrip exception: %s\n" (Printexc.to_string e); 390 + check false 391 + 392 + (* ========================================================================== 393 + Test 6: DOM manipulation consistency 394 + ========================================================================== *) 395 + 396 + let () = 397 + add_test ~name:"html5rw_dom_manipulation" [tag_name_gen; bytes] @@ fun tag text -> 398 + try 399 + (* Create element, add text, serialize, reparse *) 400 + let elem = Html5rw.create_element tag () in 401 + let text_node = Html5rw.create_text text in 402 + Html5rw.append_child elem text_node; 403 + 404 + (* Create a document to hold it *) 405 + let doc = Html5rw.create_document () in 406 + let html = Html5rw.create_element "html" () in 407 + let body = Html5rw.create_element "body" () in 408 + Html5rw.append_child doc html; 409 + Html5rw.append_child html body; 410 + Html5rw.append_child body elem; 411 + 412 + (* Serialize via Dom.to_html *) 413 + let serialized = Html5rw.Dom.to_html ~pretty:false doc in 414 + 415 + (* Reparse *) 416 + let result = Html5rw.parse (reader_of_string serialized) in 417 + let _ = Html5rw.to_string result in 418 + check true 419 + with e -> 420 + Printf.printf "\nDOM manipulation exception: %s\n" (Printexc.to_string e); 421 + check false 422 + 423 + (* ========================================================================== 424 + Test 7: Text extraction consistency 425 + ========================================================================== *) 426 + 427 + let () = 428 + add_test ~name:"html5rw_text_extraction" [html_gen] @@ fun html -> 429 + try 430 + let result = Html5rw.parse (reader_of_string html) in 431 + let _ = Html5rw.to_text result in 432 + check true 433 + with e -> 434 + Printf.printf "\nText extraction exception: %s\n" (Printexc.to_string e); 435 + check false 436 + 437 + (* ========================================================================== 438 + Test 8: Clone consistency 439 + ========================================================================== *) 440 + 441 + let () = 442 + add_test ~name:"html5rw_clone_deep" [html_gen] @@ fun html -> 443 + try 444 + let result = Html5rw.parse (reader_of_string html) in 445 + let root = Html5rw.root result in 446 + let cloned = Html5rw.clone ~deep:true root in 447 + 448 + (* Serialize both and compare *) 449 + let original_html = Html5rw.Dom.to_html ~pretty:false root in 450 + let cloned_html = Html5rw.Dom.to_html ~pretty:false cloned in 451 + 452 + if original_html <> cloned_html then begin 453 + Printf.printf "\nClone mismatch:\n"; 454 + Printf.printf "Original: %s\n" (String.escaped (String.sub original_html 0 (min 200 (String.length original_html)))); 455 + Printf.printf "Cloned: %s\n" (String.escaped (String.sub cloned_html 0 (min 200 (String.length cloned_html)))); 456 + check false 457 + end else 458 + check true 459 + with e -> 460 + Printf.printf "\nClone exception: %s\n" (Printexc.to_string e); 461 + check false 462 + 463 + (* ========================================================================== 464 + Test 9: Error collection should not affect parsing result 465 + ========================================================================== *) 466 + 467 + let () = 468 + add_test ~name:"html5rw_error_collection_consistent" [html_gen] @@ fun html -> 469 + try 470 + let r1 = Html5rw.parse ~collect_errors:false (reader_of_string html) in 471 + let r2 = Html5rw.parse ~collect_errors:true (reader_of_string html) in 472 + 473 + let s1 = serialize r1 in 474 + let s2 = serialize r2 in 475 + 476 + if s1 <> s2 then begin 477 + Printf.printf "\nError collection changes output!\n"; 478 + Printf.printf "Without: %s\n" (String.escaped (String.sub s1 0 (min 200 (String.length s1)))); 479 + Printf.printf "With: %s\n" (String.escaped (String.sub s2 0 (min 200 (String.length s2)))); 480 + check false 481 + end else 482 + check true 483 + with e -> 484 + Printf.printf "\nError collection exception: %s\n" (Printexc.to_string e); 485 + check false 486 + 487 + (* ========================================================================== 488 + Test 10: Pretty printing should produce parseable HTML 489 + ========================================================================== *) 490 + 491 + (* Helper to normalize whitespace for comparison 492 + Pretty printing adds whitespace that becomes text nodes, so we compare 493 + text content only to verify semantic equivalence. 494 + We collapse all whitespace sequences to single spaces. *) 495 + let normalize_for_comparison result = 496 + let text = Html5rw.to_text ~separator:" " ~strip:true result in 497 + (* Collapse whitespace sequences *) 498 + let buf = Buffer.create (String.length text) in 499 + let in_space = ref false in 500 + String.iter (fun c -> 501 + match c with 502 + | ' ' | '\t' | '\n' | '\r' -> 503 + if not !in_space then begin 504 + Buffer.add_char buf ' '; 505 + in_space := true 506 + end 507 + | c -> 508 + Buffer.add_char buf c; 509 + in_space := false 510 + ) text; 511 + String.trim (Buffer.contents buf) 512 + 513 + let () = 514 + add_test ~name:"html5rw_pretty_print_parseable" [html_gen] @@ fun html -> 515 + try 516 + let r1 = Html5rw.parse (reader_of_string html) in 517 + let pretty = Html5rw.to_string ~pretty:true r1 in 518 + let compact = Html5rw.to_string ~pretty:false r1 in 519 + 520 + (* Both should reparse to have same text content *) 521 + let r_pretty = Html5rw.parse (reader_of_string pretty) in 522 + let r_compact = Html5rw.parse (reader_of_string compact) in 523 + 524 + let text_pretty = normalize_for_comparison r_pretty in 525 + let text_compact = normalize_for_comparison r_compact in 526 + 527 + if text_pretty <> text_compact then begin 528 + Printf.printf "\nPretty/compact text content mismatch!\n"; 529 + Printf.printf "Input: %s\n" (String.escaped (String.sub html 0 (min 100 (String.length html)))); 530 + Printf.printf "Pretty text: %s\n" (String.escaped text_pretty); 531 + Printf.printf "Compact text: %s\n" (String.escaped text_compact); 532 + check false 533 + end else 534 + check true 535 + with e -> 536 + Printf.printf "\nPretty print exception: %s\n" (Printexc.to_string e); 537 + check false
+149
fuzz/fuzz_properties.ml
···
··· 1 + (* Property-based testing for HTML5rw 2 + Tests invariants that should always hold regardless of input *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Property 1: Parsing never raises exceptions on any input *) 7 + let test_parse_no_exception input = 8 + try 9 + let _ = Html5rw.parse (reader_of_string input) in 10 + true 11 + with _ -> false 12 + 13 + (* Property 2: Serialization never raises exceptions *) 14 + let test_serialize_no_exception input = 15 + try 16 + let result = Html5rw.parse (reader_of_string input) in 17 + let _ = Html5rw.to_string result in 18 + true 19 + with _ -> false 20 + 21 + (* Property 3: Serialized output is never longer than a reasonable bound *) 22 + let test_output_bounded input = 23 + try 24 + let result = Html5rw.parse (reader_of_string input) in 25 + let output = Html5rw.to_string ~pretty:false result in 26 + (* Output should not be more than 10x input + base HTML structure *) 27 + String.length output <= (String.length input * 10) + 1000 28 + with _ -> false 29 + 30 + (* Property 4: DOM tree depth is bounded *) 31 + let rec tree_depth node = 32 + let child_depths = List.map tree_depth node.Html5rw.Dom.children in 33 + 1 + (List.fold_left max 0 child_depths) 34 + 35 + let test_depth_bounded input = 36 + try 37 + let result = Html5rw.parse (reader_of_string input) in 38 + let depth = tree_depth (Html5rw.root result) in 39 + (* Depth should not exceed input length (at most one level per char) *) 40 + depth <= String.length input + 10 41 + with _ -> false 42 + 43 + (* Property 5: All text content from input appears somewhere in DOM *) 44 + let rec collect_text node = 45 + if node.Html5rw.Dom.name = "#text" then 46 + [node.Html5rw.Dom.data] 47 + else 48 + List.concat_map collect_text node.Html5rw.Dom.children 49 + 50 + let test_text_preserved input = 51 + try 52 + let result = Html5rw.parse (reader_of_string input) in 53 + let dom_text = String.concat "" (collect_text (Html5rw.root result)) in 54 + (* Every non-tag character should appear in text content or be structural *) 55 + let input_text = Str.global_replace (Str.regexp "<[^>]*>") "" input in 56 + let input_text = Str.global_replace (Str.regexp "&[a-zA-Z]+;") "" input_text in 57 + (* Relaxed check: DOM text should have substantial overlap with input text *) 58 + String.length dom_text >= (String.length input_text / 4) || String.length input_text < 10 59 + with _ -> true (* Parse errors are ok *) 60 + 61 + (* Property 6: Element count is bounded by tag markers in input *) 62 + let rec count_elements node = 63 + let is_element = not (String.length node.Html5rw.Dom.name > 0 && node.Html5rw.Dom.name.[0] = '#') in 64 + let child_count = List.fold_left (+) 0 (List.map count_elements node.Html5rw.Dom.children) in 65 + (if is_element then 1 else 0) + child_count 66 + 67 + let count_char c s = 68 + let count = ref 0 in 69 + String.iter (fun ch -> if ch = c then incr count) s; 70 + !count 71 + 72 + let test_element_count_bounded input = 73 + try 74 + let result = Html5rw.parse (reader_of_string input) in 75 + let elem_count = count_elements (Html5rw.root result) in 76 + let lt_count = count_char '<' input in 77 + (* Element count should not exceed < count + implicit elements (html, head, body) *) 78 + elem_count <= lt_count + 10 79 + with _ -> false 80 + 81 + (* Property 7: Attribute values survive roundtrip (modulo escaping) *) 82 + let rec collect_attrs node = 83 + let own_attrs = node.Html5rw.Dom.attrs in 84 + let child_attrs = List.concat_map collect_attrs node.Html5rw.Dom.children in 85 + own_attrs @ child_attrs 86 + 87 + let unescape_html s = 88 + let s = Str.global_replace (Str.regexp "&amp;") "&" s in 89 + let s = Str.global_replace (Str.regexp "&lt;") "<" s in 90 + let s = Str.global_replace (Str.regexp "&gt;") ">" s in 91 + let s = Str.global_replace (Str.regexp "&quot;") "\"" s in 92 + let s = Str.global_replace (Str.regexp "&#39;") "'" s in 93 + s 94 + 95 + let test_attr_roundtrip input = 96 + try 97 + let r1 = Html5rw.parse (reader_of_string input) in 98 + let s1 = Html5rw.to_string ~pretty:false r1 in 99 + let r2 = Html5rw.parse (reader_of_string s1) in 100 + let attrs1 = collect_attrs (Html5rw.root r1) in 101 + let attrs2 = collect_attrs (Html5rw.root r2) in 102 + (* After roundtrip, attribute values should match (modulo escaping) *) 103 + let normalize_attrs attrs = 104 + List.sort compare (List.map (fun (k, v) -> (k, unescape_html v)) attrs) 105 + in 106 + normalize_attrs attrs1 = normalize_attrs attrs2 || 107 + (* Allow some attrs to be dropped if they have invalid names *) 108 + List.length attrs2 <= List.length attrs1 109 + with _ -> true 110 + 111 + (* Property 8: Idempotent after first roundtrip *) 112 + let test_idempotent input = 113 + try 114 + let r1 = Html5rw.parse (reader_of_string input) in 115 + let s1 = Html5rw.to_string ~pretty:false r1 in 116 + let r2 = Html5rw.parse (reader_of_string s1) in 117 + let s2 = Html5rw.to_string ~pretty:false r2 in 118 + let r3 = Html5rw.parse (reader_of_string s2) in 119 + let s3 = Html5rw.to_string ~pretty:false r3 in 120 + s2 = s3 121 + with _ -> false 122 + 123 + (* Run all property tests *) 124 + let run_all_properties input = 125 + let results = [ 126 + ("parse_no_exception", test_parse_no_exception input); 127 + ("serialize_no_exception", test_serialize_no_exception input); 128 + ("output_bounded", test_output_bounded input); 129 + ("depth_bounded", test_depth_bounded input); 130 + ("text_preserved", test_text_preserved input); 131 + ("element_count_bounded", test_element_count_bounded input); 132 + ("attr_roundtrip", test_attr_roundtrip input); 133 + ("idempotent", test_idempotent input); 134 + ] in 135 + let failures = List.filter (fun (_, ok) -> not ok) results in 136 + if failures <> [] then begin 137 + Printf.printf "PROPERTY FAILURES for input: %s\n" (String.escaped (String.sub input 0 (min 100 (String.length input)))); 138 + List.iter (fun (name, _) -> Printf.printf " - %s\n" name) failures; 139 + false 140 + end else 141 + true 142 + 143 + (* AFL entry point *) 144 + let () = 145 + AflPersistent.run (fun () -> 146 + let input = In_channel.input_all In_channel.stdin in 147 + if not (run_all_properties input) then 148 + exit 1 149 + )
+245
fuzz/fuzz_security.ml
···
··· 1 + (* Security/sanitizer testing for HTML5rw 2 + Tests XSS vectors, mXSS patterns, and security-relevant parsing behavior *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Helper to check if script-like content appears in output *) 7 + let contains_script_tag output = 8 + let output_lower = String.lowercase_ascii output in 9 + String.length output_lower >= 7 && 10 + (try let _ = Str.search_forward (Str.regexp "<script") output_lower 0 in true with Not_found -> false) 11 + 12 + (* Reserved for future use in sanitizer testing *) 13 + let _contains_event_handler output = 14 + let output_lower = String.lowercase_ascii output in 15 + try let _ = Str.search_forward (Str.regexp "on[a-z]+=") output_lower 0 in true with Not_found -> false 16 + 17 + let _contains_javascript_url output = 18 + let output_lower = String.lowercase_ascii output in 19 + try let _ = Str.search_forward (Str.regexp "javascript:") output_lower 0 in true with Not_found -> false 20 + 21 + (* Test parsing and serialization *) 22 + let parse_and_serialize input = 23 + try 24 + let doc = Html5rw.parse (reader_of_string input) in 25 + Some (Html5rw.to_string ~pretty:false doc) 26 + with _ -> None 27 + 28 + (* Category 1: Basic XSS vectors (these should parse cleanly, not be sanitized) *) 29 + let basic_xss_vectors = [| 30 + "<script>alert(1)</script>"; 31 + "<img src=x onerror=alert(1)>"; 32 + "<svg onload=alert(1)>"; 33 + "<body onload=alert(1)>"; 34 + "<a href=\"javascript:alert(1)\">click</a>"; 35 + "<iframe src=\"javascript:alert(1)\">"; 36 + "<input onfocus=alert(1) autofocus>"; 37 + "<marquee onstart=alert(1)>"; 38 + "<video><source onerror=alert(1)>"; 39 + "<details ontoggle=alert(1) open>"; 40 + |] 41 + 42 + (* Category 2: Obfuscated XSS (parser should handle these consistently) *) 43 + let obfuscated_xss = [| 44 + (* Case variations *) 45 + "<ScRiPt>alert(1)</sCrIpT>"; 46 + "<IMG SRC=x ONERROR=alert(1)>"; 47 + "<SVG ONLOAD=alert(1)>"; 48 + 49 + (* Whitespace variations *) 50 + "<script\n>alert(1)</script>"; 51 + "<script\t>alert(1)</script>"; 52 + "<script\r>alert(1)</script>"; 53 + "<img src=x\nonerror=alert(1)>"; 54 + 55 + (* Null bytes (should be handled) *) 56 + "<scr\x00ipt>alert(1)</script>"; 57 + "<img src=x onerr\x00or=alert(1)>"; 58 + 59 + (* Entity encoding in attributes *) 60 + "<a href=\"java&#115;cript:alert(1)\">x</a>"; 61 + "<a href=\"java&#x73;cript:alert(1)\">x</a>"; 62 + "<img src=x onerror=&#97;lert(1)>"; 63 + |] 64 + 65 + (* Category 3: mXSS patterns (mutation XSS through parser quirks) *) 66 + let mxss_patterns = [| 67 + (* Backtick in attributes *) 68 + "<img src=`x`onerror=alert(1)>"; 69 + "<div style=`background:url(x)`onmouseover=alert(1)>"; 70 + 71 + (* Unclosed tags/attributes *) 72 + "<img src=\"x\" onerror=\"alert(1)"; 73 + "<img src=x onerror=alert(1)//"; 74 + "<div attr=\"></div><script>alert(1)</script>"; 75 + 76 + (* Tag breaking *) 77 + "<div><script>alert(1)</script"; 78 + "<div><<script>alert(1)</script>"; 79 + "</title><script>alert(1)</script>"; 80 + 81 + (* Foreign content escapes *) 82 + "<svg><![CDATA[<script>alert(1)</script>]]></svg>"; 83 + "<svg><foreignObject><script>alert(1)</script></foreignObject></svg>"; 84 + "<math><mtext><script>alert(1)</script></mtext></math>"; 85 + 86 + (* Template injection *) 87 + "<template><script>alert(1)</script></template>"; 88 + 89 + (* Noscript edge cases *) 90 + "<noscript><script>alert(1)</script></noscript>"; 91 + |] 92 + 93 + (* Category 4: Attribute injection patterns *) 94 + let attr_injection = [| 95 + (* Breaking out of attributes *) 96 + "<div title=\"x\" onclick=\"alert(1)\">x</div>"; 97 + "<div title='x' onclick='alert(1)'>x</div>"; 98 + "<div title=x onclick=alert(1)>x</div>"; 99 + 100 + (* Attribute without value *) 101 + "<input value=\"x\" onfocus autofocus>"; 102 + 103 + (* Multiple attributes *) 104 + "<div a=1 b=2 onclick=alert(1) c=3>x</div>"; 105 + 106 + (* Quote mismatches *) 107 + "<div title=\"x'onclick=alert(1)//\">x</div>"; 108 + "<div title='x\"onclick=alert(1)//'>x</div>"; 109 + 110 + (* Entity in attribute names *) 111 + "<div o&#110;click=alert(1)>x</div>"; 112 + |] 113 + 114 + (* Category 5: URL-based attacks *) 115 + let url_attacks = [| 116 + "<a href=\"javascript:alert(1)\">x</a>"; 117 + "<a href=\"JAVASCRIPT:alert(1)\">x</a>"; 118 + "<a href=\" javascript:alert(1)\">x</a>"; 119 + "<a href=\"&#106;avascript:alert(1)\">x</a>"; 120 + "<a href=\"java\tscript:alert(1)\">x</a>"; 121 + "<a href=\"java\nscript:alert(1)\">x</a>"; 122 + "<a href=\"java\rscript:alert(1)\">x</a>"; 123 + "<a href=\"data:text/html,<script>alert(1)</script>\">x</a>"; 124 + "<a href=\"vbscript:alert(1)\">x</a>"; 125 + "<iframe src=\"javascript:alert(1)\">"; 126 + "<embed src=\"javascript:alert(1)\">"; 127 + "<object data=\"javascript:alert(1)\">"; 128 + "<form action=\"javascript:alert(1)\">"; 129 + |] 130 + 131 + (* Category 6: Style-based attacks *) 132 + let style_attacks = [| 133 + "<div style=\"background:url(javascript:alert(1))\">x</div>"; 134 + "<div style=\"expression(alert(1))\">x</div>"; 135 + "<div style=\"-moz-binding:url(http://evil.com/xss.xml#xss)\">x</div>"; 136 + "<style>@import 'http://evil.com/xss.css';</style>"; 137 + "<style>body { background: url('javascript:alert(1)'); }</style>"; 138 + "<link rel=\"stylesheet\" href=\"javascript:alert(1)\">"; 139 + |] 140 + 141 + (* Category 7: Tag soup and parser confusion *) 142 + let tag_soup = [| 143 + "<div<div>test</div>"; 144 + "<div<<div>>test</div>"; 145 + "<<div>>test</div>"; 146 + "<div>test</div</div>>"; 147 + "</</div>>"; 148 + "</ div>"; 149 + "<div / onclick=alert(1)>"; 150 + "<div/onclick=alert(1)>"; 151 + "<div><</div>"; 152 + "<div>></div>"; 153 + "<div>&lt;script&gt;alert(1)&lt;/script&gt;</div>"; 154 + |] 155 + 156 + (* Test that parsing is stable (no mXSS through parse-serialize-parse) *) 157 + let test_mxss_stability input = 158 + match parse_and_serialize input with 159 + | None -> (true, "parse failed") (* Parse failure is ok for malformed input *) 160 + | Some s1 -> 161 + match parse_and_serialize s1 with 162 + | None -> (false, "re-parse failed") 163 + | Some s2 -> 164 + match parse_and_serialize s2 with 165 + | None -> (false, "third parse failed") 166 + | Some s3 -> 167 + if s2 = s3 then (true, "stable") 168 + else (false, Printf.sprintf "unstable: s2=%s s3=%s" (String.escaped s2) (String.escaped s3)) 169 + 170 + (* Test that dangerous content doesn't appear after parsing innocuous-looking input *) 171 + let test_no_script_injection input = 172 + if contains_script_tag input then 173 + (* If input has script, we expect output might too *) 174 + true 175 + else 176 + match parse_and_serialize input with 177 + | None -> true 178 + | Some output -> 179 + if contains_script_tag output then begin 180 + Printf.printf "SCRIPT TAG APPEARED:\n"; 181 + Printf.printf " Input: %s\n" (String.escaped input); 182 + Printf.printf " Output: %s\n" (String.escaped output); 183 + false 184 + end else 185 + true 186 + 187 + let run_test_category name cases = 188 + Printf.printf "--- %s ---\n" name; 189 + let stable_count = ref 0 in 190 + let unstable_count = ref 0 in 191 + Array.iter (fun input -> 192 + let (stable, msg) = test_mxss_stability input in 193 + if stable then incr stable_count 194 + else begin 195 + Printf.printf "UNSTABLE: %s\n" (String.escaped (String.sub input 0 (min 60 (String.length input)))); 196 + Printf.printf " %s\n" msg; 197 + incr unstable_count 198 + end 199 + ) cases; 200 + Printf.printf "%s: %d stable, %d unstable\n\n" name !stable_count !unstable_count; 201 + !unstable_count = 0 202 + 203 + let () = 204 + Printf.printf "=== Security/Sanitizer Tests ===\n\n"; 205 + 206 + let all_pass = ref true in 207 + 208 + (* Test each category for mXSS stability *) 209 + if not (run_test_category "Basic XSS vectors" basic_xss_vectors) then 210 + all_pass := false; 211 + if not (run_test_category "Obfuscated XSS" obfuscated_xss) then 212 + all_pass := false; 213 + if not (run_test_category "mXSS patterns" mxss_patterns) then 214 + all_pass := false; 215 + if not (run_test_category "Attribute injection" attr_injection) then 216 + all_pass := false; 217 + if not (run_test_category "URL attacks" url_attacks) then 218 + all_pass := false; 219 + if not (run_test_category "Style attacks" style_attacks) then 220 + all_pass := false; 221 + if not (run_test_category "Tag soup" tag_soup) then 222 + all_pass := false; 223 + 224 + (* Test for script tag injection *) 225 + Printf.printf "--- Script injection tests ---\n"; 226 + let inject_pass = ref 0 in 227 + let inject_fail = ref 0 in 228 + let non_script_inputs = Array.concat [attr_injection; tag_soup] in 229 + Array.iter (fun input -> 230 + if test_no_script_injection input then 231 + incr inject_pass 232 + else 233 + incr inject_fail 234 + ) non_script_inputs; 235 + Printf.printf "No unexpected script injection: %d/%d\n\n" 236 + !inject_pass (!inject_pass + !inject_fail); 237 + if !inject_fail > 0 then all_pass := false; 238 + 239 + Printf.printf "=== Summary ===\n"; 240 + if !all_pass then 241 + Printf.printf "All security tests passed!\n" 242 + else begin 243 + Printf.printf "Some security tests failed!\n"; 244 + exit 1 245 + end
+215
fuzz/fuzz_serializer.ml
···
··· 1 + (* Serializer-specific fuzzer for HTML5rw 2 + Tests serialization edge cases and formatting options *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Normalize whitespace for comparison - removes formatting differences *) 7 + let normalize_whitespace s = 8 + let s = Str.global_replace (Str.regexp "[\n\r\t ]+") " " s in 9 + let s = Str.global_replace (Str.regexp "> <") "><" s in 10 + String.trim s 11 + 12 + (* Test serialization with different pretty-print settings *) 13 + let test_pretty_modes input = 14 + try 15 + let doc = Html5rw.parse (reader_of_string input) in 16 + 17 + let s_compact = Html5rw.to_string ~pretty:false doc in 18 + let s_pretty = Html5rw.to_string ~pretty:true doc in 19 + 20 + (* Both should parse back to equivalent DOMs *) 21 + let doc_compact = Html5rw.parse (reader_of_string s_compact) in 22 + let doc_pretty = Html5rw.parse (reader_of_string s_pretty) in 23 + 24 + let s_compact2 = Html5rw.to_string ~pretty:false doc_compact in 25 + let s_pretty2 = Html5rw.to_string ~pretty:false doc_pretty in 26 + 27 + (* Compact versions should be identical (roundtrip stable) *) 28 + if s_compact <> s_compact2 then begin 29 + Printf.printf "Compact roundtrip mismatch:\n"; 30 + Printf.printf " s_compact: %s\n" (String.escaped s_compact); 31 + Printf.printf " s_compact2: %s\n" (String.escaped s_compact2); 32 + false 33 + end else begin 34 + (* Pretty and compact should have same semantic content (modulo whitespace) *) 35 + let norm_compact = normalize_whitespace s_compact in 36 + let norm_pretty2 = normalize_whitespace s_pretty2 in 37 + if norm_compact <> norm_pretty2 then begin 38 + Printf.printf "Pretty/compact semantic mismatch:\n"; 39 + Printf.printf " From compact: %s\n" (String.escaped norm_compact); 40 + Printf.printf " From pretty: %s\n" (String.escaped norm_pretty2); 41 + false 42 + end else 43 + true 44 + end 45 + with e -> 46 + Printf.printf "Exception: %s\n" (Printexc.to_string e); 47 + Printf.printf " Input: %s\n" (String.escaped input); 48 + false 49 + 50 + (* Test attribute serialization *) 51 + let attr_test_cases = [| 52 + (* Basic attributes *) 53 + "<div id=\"test\"></div>"; 54 + "<div class=\"foo bar\"></div>"; 55 + "<div data-x=\"value\"></div>"; 56 + 57 + (* Quoting *) 58 + "<div attr=\"a'b\"></div>"; 59 + "<div attr='a\"b'></div>"; 60 + "<div attr=\"a&quot;b\"></div>"; 61 + "<div attr='a&apos;b'></div>"; 62 + 63 + (* Special characters *) 64 + "<div attr=\"a<b\"></div>"; 65 + "<div attr=\"a>b\"></div>"; 66 + "<div attr=\"a&b\"></div>"; 67 + "<div attr=\"a&amp;b\"></div>"; 68 + 69 + (* Empty and valueless *) 70 + "<div attr></div>"; 71 + "<div attr=\"\"></div>"; 72 + "<input disabled>"; 73 + "<input checked>"; 74 + 75 + (* Whitespace *) 76 + "<div attr=\"a b\"></div>"; 77 + "<div attr=\"a\nb\"></div>"; 78 + "<div attr=\"a\tb\"></div>"; 79 + 80 + (* URLs *) 81 + "<a href=\"http://example.com?a=1&b=2\"></a>"; 82 + "<a href=\"javascript:alert('x')\"></a>"; 83 + "<a href=\"data:text/html,<script>x</script>\"></a>"; 84 + 85 + (* Event handlers *) 86 + "<div onclick=\"alert(&quot;x&quot;)\"></div>"; 87 + "<div onclick='alert(\"x\")'></div>"; 88 + 89 + (* Multiple attributes *) 90 + "<div a=\"1\" b=\"2\" c=\"3\"></div>"; 91 + "<div a b c></div>"; 92 + |] 93 + 94 + (* Test void element serialization *) 95 + let void_test_cases = [| 96 + "<br>"; 97 + "<br/>"; 98 + "<br />"; 99 + "<hr>"; 100 + "<img src=\"x\">"; 101 + "<input type=\"text\">"; 102 + "<meta charset=\"utf-8\">"; 103 + "<link rel=\"stylesheet\">"; 104 + "<area>"; 105 + "<base href=\"/\">"; 106 + "<col span=\"2\">"; 107 + "<embed src=\"x\">"; 108 + "<source src=\"x\">"; 109 + "<track src=\"x\">"; 110 + "<wbr>"; 111 + |] 112 + 113 + (* Test raw text element serialization *) 114 + (* Note: Test cases with </script> inside script are omitted because they are 115 + intentionally invalid HTML and the parser correctly terminates at </script> *) 116 + let rawtext_test_cases = [| 117 + "<script>var x = 1;</script>"; 118 + "<script>var x = '<div>';</script>"; 119 + "<style>.x { color: red; }</style>"; 120 + "<textarea>Hello world</textarea>"; 121 + "<textarea><div>not an element</div></textarea>"; 122 + "<title>Page &amp; Title</title>"; 123 + "<xmp><div>preformatted</div></xmp>"; 124 + |] 125 + 126 + (* Test whitespace preservation *) 127 + let whitespace_test_cases = [| 128 + "<pre> spaces </pre>"; 129 + "<pre>\n\nlines\n\n</pre>"; 130 + "<pre>\ttabs\t</pre>"; 131 + "<code> code </code>"; 132 + "<textarea> text </textarea>"; 133 + "<div> text </div>"; 134 + "<p> text </p>"; 135 + |] 136 + 137 + (* Test entity serialization *) 138 + let entity_test_cases = [| 139 + "<div>&amp;</div>"; 140 + "<div>&lt;</div>"; 141 + "<div>&gt;</div>"; 142 + "<div>&quot;</div>"; 143 + "<div>&apos;</div>"; 144 + "<div>&nbsp;</div>"; 145 + "<div>&#60;</div>"; 146 + "<div>&#x3C;</div>"; 147 + "<div>&copy;</div>"; 148 + "<div>&mdash;</div>"; 149 + |] 150 + 151 + (* Test nested structure serialization *) 152 + let nested_test_cases = [| 153 + "<div><div><div></div></div></div>"; 154 + "<table><tbody><tr><td><table><tbody><tr><td></td></tr></tbody></table></td></tr></tbody></table>"; 155 + "<ul><li><ul><li><ul><li></li></ul></li></ul></li></ul>"; 156 + "<dl><dt><dl><dt></dt></dl></dt></dl>"; 157 + |] 158 + 159 + (* Test foreign content serialization *) 160 + let foreign_test_cases = [| 161 + "<svg></svg>"; 162 + "<svg viewBox=\"0 0 100 100\"><circle cx=\"50\" cy=\"50\" r=\"40\"/></svg>"; 163 + "<svg><text>Hello</text></svg>"; 164 + "<math></math>"; 165 + "<math><mi>x</mi><mo>=</mo><mn>1</mn></math>"; 166 + "<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>"; 167 + |] 168 + 169 + let run_test_category name cases test_fn = 170 + let passed = ref 0 in 171 + let failed = ref 0 in 172 + Array.iter (fun input -> 173 + if test_fn input then 174 + incr passed 175 + else begin 176 + Printf.printf "FAIL: %s\n" (String.escaped input); 177 + incr failed 178 + end 179 + ) cases; 180 + Printf.printf "%s: %d/%d\n" name !passed (Array.length cases); 181 + !failed = 0 182 + 183 + let () = 184 + let all_pass = ref true in 185 + 186 + Printf.printf "=== Serializer Tests ===\n\n"; 187 + 188 + if not (run_test_category "Attribute serialization" attr_test_cases test_pretty_modes) then 189 + all_pass := false; 190 + 191 + if not (run_test_category "Void elements" void_test_cases test_pretty_modes) then 192 + all_pass := false; 193 + 194 + if not (run_test_category "Raw text elements" rawtext_test_cases test_pretty_modes) then 195 + all_pass := false; 196 + 197 + if not (run_test_category "Whitespace preservation" whitespace_test_cases test_pretty_modes) then 198 + all_pass := false; 199 + 200 + if not (run_test_category "Entity serialization" entity_test_cases test_pretty_modes) then 201 + all_pass := false; 202 + 203 + if not (run_test_category "Nested structures" nested_test_cases test_pretty_modes) then 204 + all_pass := false; 205 + 206 + if not (run_test_category "Foreign content" foreign_test_cases test_pretty_modes) then 207 + all_pass := false; 208 + 209 + Printf.printf "\n=== Summary ===\n"; 210 + if !all_pass then 211 + Printf.printf "All serializer tests passed!\n" 212 + else begin 213 + Printf.printf "Some serializer tests failed!\n"; 214 + exit 1 215 + end
+214
fuzz/fuzz_streaming.ml
···
··· 1 + (* Streaming/incremental fuzzer for HTML5rw 2 + Tests that parsing produces identical results regardless of input characteristics *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Test that repeated parsing produces stable results *) 7 + let test_stability input = 8 + try 9 + (* Parse multiple times and ensure consistent results *) 10 + let doc1 = Html5rw.parse (reader_of_string input) in 11 + let s1 = Html5rw.to_string ~pretty:false doc1 in 12 + 13 + let doc2 = Html5rw.parse (reader_of_string input) in 14 + let s2 = Html5rw.to_string ~pretty:false doc2 in 15 + 16 + let doc3 = Html5rw.parse (reader_of_string input) in 17 + let s3 = Html5rw.to_string ~pretty:false doc3 in 18 + 19 + if s1 <> s2 || s2 <> s3 then begin 20 + Printf.printf "PARSING NOT DETERMINISTIC:\n"; 21 + Printf.printf " S1: %s\n" (String.escaped (String.sub s1 0 (min 100 (String.length s1)))); 22 + Printf.printf " S2: %s\n" (String.escaped (String.sub s2 0 (min 100 (String.length s2)))); 23 + Printf.printf " S3: %s\n" (String.escaped (String.sub s3 0 (min 100 (String.length s3)))); 24 + false 25 + end else 26 + true 27 + with e -> 28 + Printf.printf "Exception: %s\n" (Printexc.to_string e); 29 + Printf.printf " Input: %s\n" (String.escaped (String.sub input 0 (min 100 (String.length input)))); 30 + false 31 + 32 + (* Test roundtrip stability *) 33 + let test_roundtrip input = 34 + try 35 + let doc1 = Html5rw.parse (reader_of_string input) in 36 + let s1 = Html5rw.to_string ~pretty:false doc1 in 37 + let doc2 = Html5rw.parse (reader_of_string s1) in 38 + let s2 = Html5rw.to_string ~pretty:false doc2 in 39 + let doc3 = Html5rw.parse (reader_of_string s2) in 40 + let s3 = Html5rw.to_string ~pretty:false doc3 in 41 + if s2 <> s3 then begin 42 + Printf.printf "ROUNDTRIP UNSTABLE:\n"; 43 + Printf.printf " Input: %s\n" (String.escaped (String.sub input 0 (min 50 (String.length input)))); 44 + Printf.printf " S2: %s\n" (String.escaped (String.sub s2 0 (min 100 (String.length s2)))); 45 + Printf.printf " S3: %s\n" (String.escaped (String.sub s3 0 (min 100 (String.length s3)))); 46 + false 47 + end else 48 + true 49 + with e -> 50 + Printf.printf "Exception in roundtrip: %s\n" (Printexc.to_string e); 51 + false 52 + 53 + (* Test cases that stress boundary handling *) 54 + let boundary_test_cases = [| 55 + (* Tags split at various positions *) 56 + "<div></div>"; 57 + "<div attr=\"value\"></div>"; 58 + "<!DOCTYPE html>"; 59 + "<!-- comment -->"; 60 + 61 + (* Entity references *) 62 + "&amp;"; 63 + "&lt;test&gt;"; 64 + "&#60;"; 65 + "&#x3C;"; 66 + 67 + (* Multi-byte UTF-8 *) 68 + "caf\xC3\xA9"; (* cafรฉ *) 69 + "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"; (* ๆ—ฅๆœฌ่ชž *) 70 + "\xF0\x9F\x8E\x89"; (* ๐ŸŽ‰ *) 71 + "<div>\xE4\xB8\xAD\xE6\x96\x87</div>"; 72 + 73 + (* Script/style content *) 74 + "<script>var x = 1;</script>"; 75 + "<style>.x { color: red; }</style>"; 76 + 77 + (* CDATA-like in script *) 78 + "<script>//<![CDATA[\nvar x = 1;\n//]]></script>"; 79 + 80 + (* Long strings *) 81 + String.make 100 'x'; 82 + "<div>" ^ String.make 100 'x' ^ "</div>"; 83 + 84 + (* Many small tags *) 85 + String.concat "" (List.init 20 (fun _ -> "<b>x</b>")); 86 + 87 + (* Whitespace variations *) 88 + "<div attr = 'value' ></div>"; 89 + "<pre>\n\n\ntext\n\n\n</pre>"; 90 + 91 + (* Mixed content *) 92 + "<div>text<span>more</span>text</div>"; 93 + |] 94 + 95 + (* Additional edge cases *) 96 + let edge_cases = [| 97 + (* Empty *) 98 + ""; 99 + 100 + (* Just whitespace *) 101 + " "; 102 + "\n\n\n"; 103 + "\t\t\t"; 104 + 105 + (* Single characters *) 106 + "<"; 107 + ">"; 108 + "&"; 109 + "/"; 110 + 111 + (* Partial tags *) 112 + "<d"; 113 + "<di"; 114 + "<div"; 115 + "<div>"; 116 + 117 + (* CR/LF variations *) 118 + "<div>\r\n</div>"; 119 + "<div>\r</div>"; 120 + "<div>\n</div>"; 121 + "line1\r\nline2\rline3\nline4"; 122 + 123 + (* Multiple documents *) 124 + "<!DOCTYPE html><html><body>x</body></html>"; 125 + |] 126 + 127 + (* Test that pretty-printing produces valid HTML that roundtrips properly *) 128 + let test_pretty_roundtrip input = 129 + try 130 + let doc = Html5rw.parse (reader_of_string input) in 131 + let s_pretty = Html5rw.to_string ~pretty:true doc in 132 + 133 + (* Pretty output should roundtrip-stable *) 134 + let doc_pretty = Html5rw.parse (reader_of_string s_pretty) in 135 + let s_pretty2 = Html5rw.to_string ~pretty:true doc_pretty in 136 + let doc_pretty2 = Html5rw.parse (reader_of_string s_pretty2) in 137 + let s_pretty3 = Html5rw.to_string ~pretty:true doc_pretty2 in 138 + 139 + if s_pretty2 <> s_pretty3 then begin 140 + Printf.printf "PRETTY ROUNDTRIP UNSTABLE:\n"; 141 + Printf.printf " S2: %s\n" (String.escaped (String.sub s_pretty2 0 (min 100 (String.length s_pretty2)))); 142 + Printf.printf " S3: %s\n" (String.escaped (String.sub s_pretty3 0 (min 100 (String.length s_pretty3)))); 143 + false 144 + end else 145 + true 146 + with e -> 147 + Printf.printf "Exception: %s\n" (Printexc.to_string e); 148 + false 149 + 150 + (* Run all tests *) 151 + let run_all_tests () = 152 + let test_cases = Array.concat [boundary_test_cases; edge_cases] in 153 + let all_pass = ref true in 154 + 155 + Printf.printf "=== Streaming/Stability Tests ===\n\n"; 156 + 157 + (* Test parsing stability *) 158 + Printf.printf "--- Parsing stability ---\n"; 159 + let stable_pass = ref 0 in 160 + let stable_fail = ref 0 in 161 + Array.iter (fun input -> 162 + if test_stability input then 163 + incr stable_pass 164 + else 165 + incr stable_fail 166 + ) test_cases; 167 + Printf.printf "Parsing stability: %d/%d\n" !stable_pass (Array.length test_cases); 168 + if !stable_fail > 0 then all_pass := false; 169 + 170 + (* Test roundtrip stability *) 171 + Printf.printf "\n--- Roundtrip stability ---\n"; 172 + let rt_pass = ref 0 in 173 + let rt_fail = ref 0 in 174 + Array.iter (fun input -> 175 + if test_roundtrip input then 176 + incr rt_pass 177 + else 178 + incr rt_fail 179 + ) test_cases; 180 + Printf.printf "Roundtrip stability: %d/%d\n" !rt_pass (Array.length test_cases); 181 + if !rt_fail > 0 then all_pass := false; 182 + 183 + (* Test pretty printing roundtrip stability *) 184 + Printf.printf "\n--- Pretty printing roundtrip ---\n"; 185 + let pretty_pass = ref 0 in 186 + let pretty_fail = ref 0 in 187 + Array.iter (fun input -> 188 + if test_pretty_roundtrip input then 189 + incr pretty_pass 190 + else 191 + incr pretty_fail 192 + ) test_cases; 193 + Printf.printf "Pretty roundtrip: %d/%d\n" !pretty_pass (Array.length test_cases); 194 + if !pretty_fail > 0 then all_pass := false; 195 + 196 + Printf.printf "\n=== Summary ===\n"; 197 + if !all_pass then 198 + Printf.printf "All streaming/stability tests passed!\n" 199 + else begin 200 + Printf.printf "Some tests failed!\n"; 201 + exit 1 202 + end 203 + 204 + (* AFL entry point for fuzz testing *) 205 + let fuzz_mode () = 206 + let input = In_channel.input_all In_channel.stdin in 207 + if not (test_stability input && test_roundtrip input) then 208 + exit 1 209 + 210 + let () = 211 + if Array.length Sys.argv > 1 && Sys.argv.(1) = "--fuzz" then 212 + fuzz_mode () 213 + else 214 + run_all_tests ()
+176
fuzz/fuzz_structure.ml
···
··· 1 + (* Structure-aware HTML fuzzer 2 + Generates and mutates syntactically plausible HTML to find edge cases *) 3 + 4 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 5 + 6 + (* Common HTML elements for generation *) 7 + let void_elements = [| "br"; "hr"; "img"; "input"; "meta"; "link"; "area"; "base"; "col"; "embed"; "source"; "track"; "wbr" |] 8 + let block_elements = [| "div"; "p"; "h1"; "h2"; "h3"; "section"; "article"; "header"; "footer"; "main"; "nav"; "aside"; "blockquote"; "pre"; "ul"; "ol"; "li"; "dl"; "dt"; "dd"; "figure"; "figcaption"; "table"; "form"; "fieldset" |] 9 + let inline_elements = [| "span"; "a"; "em"; "strong"; "b"; "i"; "u"; "s"; "small"; "big"; "code"; "kbd"; "var"; "samp"; "cite"; "q"; "abbr"; "time"; "mark"; "sub"; "sup" |] 10 + (* Formatting elements for adoption agency algorithm testing *) 11 + let _formatting_elements = [| "a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u" |] 12 + let table_elements = [| "table"; "thead"; "tbody"; "tfoot"; "tr"; "th"; "td"; "caption"; "colgroup"; "col" |] 13 + let special_elements = [| "script"; "style"; "template"; "svg"; "math"; "textarea"; "title"; "noscript"; "iframe"; "xmp"; "plaintext" |] 14 + 15 + let all_elements = Array.concat [void_elements; block_elements; inline_elements; table_elements; special_elements] 16 + 17 + (* Random selection *) 18 + let pick arr = arr.(Random.int (Array.length arr)) 19 + 20 + (* Generate random attribute *) 21 + let gen_attr_name () = 22 + let names = [| "id"; "class"; "style"; "href"; "src"; "alt"; "title"; "name"; "value"; "type"; "data-x"; "aria-label"; "onclick"; "onload" |] in 23 + pick names 24 + 25 + let gen_attr_value () = 26 + let values = [| ""; "x"; "test"; "foo bar"; "a\"b"; "a'b"; "a<b"; "a>b"; "a&b"; "<script>"; "javascript:"; "&#x0;"; "\x00"; "\n\t" |] in 27 + pick values 28 + 29 + let gen_attrs n = 30 + let buf = Buffer.create 64 in 31 + for _ = 1 to n do 32 + Buffer.add_char buf ' '; 33 + Buffer.add_string buf (gen_attr_name ()); 34 + if Random.bool () then begin 35 + Buffer.add_string buf "=\""; 36 + Buffer.add_string buf (gen_attr_value ()); 37 + Buffer.add_char buf '"' 38 + end 39 + done; 40 + Buffer.contents buf 41 + 42 + (* Generate random HTML *) 43 + let rec gen_html depth max_depth = 44 + if depth >= max_depth then 45 + (* Terminal: text or void element *) 46 + if Random.bool () then 47 + let texts = [| "hello"; "world"; "<text>"; "&amp;"; ""; " "; "\n" |] in 48 + pick texts 49 + else 50 + "<" ^ pick void_elements ^ gen_attrs (Random.int 3) ^ ">" 51 + else 52 + let tag = pick all_elements in 53 + let attrs = gen_attrs (Random.int 4) in 54 + let is_void = Array.mem tag void_elements in 55 + if is_void then 56 + "<" ^ tag ^ attrs ^ ">" 57 + else 58 + let children = Buffer.create 256 in 59 + let num_children = Random.int 5 in 60 + for _ = 1 to num_children do 61 + Buffer.add_string children (gen_html (depth + 1) max_depth) 62 + done; 63 + "<" ^ tag ^ attrs ^ ">" ^ Buffer.contents children ^ "</" ^ tag ^ ">" 64 + 65 + (* Mutation strategies *) 66 + type mutation = 67 + | DeleteChar of int 68 + | InsertChar of int * char 69 + | SwapChars of int * int 70 + | DuplicateRange of int * int 71 + | CorruptTag of int 72 + | UnclosedTag of int 73 + | MisnestedTags 74 + | InsertNullByte of int 75 + | TruncateAt of int 76 + 77 + let apply_mutation input mutation = 78 + let len = String.length input in 79 + if len = 0 then input else 80 + match mutation with 81 + | DeleteChar i when i < len -> 82 + String.sub input 0 i ^ String.sub input (i + 1) (len - i - 1) 83 + | InsertChar (i, c) when i <= len -> 84 + String.sub input 0 i ^ String.make 1 c ^ String.sub input i (len - i) 85 + | SwapChars (i, j) when i < len && j < len -> 86 + let bytes = Bytes.of_string input in 87 + let tmp = Bytes.get bytes i in 88 + Bytes.set bytes i (Bytes.get bytes j); 89 + Bytes.set bytes j tmp; 90 + Bytes.to_string bytes 91 + | DuplicateRange (start, len') when start < len && start + len' <= len -> 92 + let range = String.sub input start len' in 93 + String.sub input 0 start ^ range ^ range ^ String.sub input (start + len') (len - start - len') 94 + | CorruptTag i -> 95 + (* Find a < after position i and corrupt the tag *) 96 + (try 97 + let tag_start = String.index_from input (min i (len - 1)) '<' in 98 + String.sub input 0 (tag_start + 1) ^ "\x00" ^ String.sub input (tag_start + 1) (len - tag_start - 1) 99 + with Not_found -> input) 100 + | UnclosedTag i -> 101 + (* Find a </ after position i and remove it *) 102 + (try 103 + let close_start = Str.search_forward (Str.regexp "</") input (min i (len - 1)) in 104 + let close_end = String.index_from input close_start '>' in 105 + String.sub input 0 close_start ^ String.sub input (close_end + 1) (len - close_end - 1) 106 + with Not_found -> input) 107 + | MisnestedTags -> 108 + (* Generate misnested tags *) 109 + let tags = [| "a"; "b"; "i"; "em"; "strong"; "span"; "div"; "p" |] in 110 + let t1, t2 = pick tags, pick tags in 111 + input ^ "<" ^ t1 ^ "><" ^ t2 ^ "></" ^ t1 ^ "></" ^ t2 ^ ">" 112 + | InsertNullByte i when i < len -> 113 + String.sub input 0 i ^ "\x00" ^ String.sub input i (len - i) 114 + | TruncateAt i when i < len -> 115 + String.sub input 0 i 116 + | _ -> input 117 + 118 + let random_mutation input = 119 + let len = String.length input in 120 + let mutations = [| 121 + DeleteChar (Random.int (max 1 len)); 122 + InsertChar (Random.int (len + 1), Char.chr (Random.int 256)); 123 + SwapChars (Random.int (max 1 len), Random.int (max 1 len)); 124 + DuplicateRange (Random.int (max 1 len), Random.int (min 50 (max 1 len))); 125 + CorruptTag (Random.int (max 1 len)); 126 + UnclosedTag (Random.int (max 1 len)); 127 + MisnestedTags; 128 + InsertNullByte (Random.int (max 1 len)); 129 + TruncateAt (Random.int (max 1 len)); 130 + |] in 131 + apply_mutation input (pick mutations) 132 + 133 + (* Test function: parse, serialize, and check for crashes/hangs *) 134 + let test_html input = 135 + try 136 + let r1 = Html5rw.parse (reader_of_string input) in 137 + let s1 = Html5rw.to_string ~pretty:false r1 in 138 + let r2 = Html5rw.parse (reader_of_string s1) in 139 + let s2 = Html5rw.to_string ~pretty:false r2 in 140 + let r3 = Html5rw.parse (reader_of_string s2) in 141 + let s3 = Html5rw.to_string ~pretty:false r3 in 142 + if s2 <> s3 then begin 143 + Printf.printf "ROUNDTRIP FAILED:\n"; 144 + Printf.printf "Input: %s\n" (String.escaped (String.sub input 0 (min 200 (String.length input)))); 145 + Printf.printf "s2: %s\n" (String.escaped (String.sub s2 0 (min 200 (String.length s2)))); 146 + Printf.printf "s3: %s\n" (String.escaped (String.sub s3 0 (min 200 (String.length s3)))); 147 + false 148 + end else 149 + true 150 + with e -> 151 + Printf.printf "EXCEPTION: %s\n" (Printexc.to_string e); 152 + Printf.printf "Input: %s\n" (String.escaped (String.sub input 0 (min 200 (String.length input)))); 153 + false 154 + 155 + (* Main fuzzing loop *) 156 + let () = 157 + Random.self_init (); 158 + 159 + (* Mode: generate or mutate based on stdin *) 160 + let input = In_channel.input_all In_channel.stdin in 161 + 162 + let test_input = 163 + if String.length input < 10 then 164 + (* Generate random HTML *) 165 + gen_html 0 (3 + Random.int 5) 166 + else 167 + (* Mutate the input *) 168 + let mutated = ref input in 169 + for _ = 1 to 1 + Random.int 5 do 170 + mutated := random_mutation !mutated 171 + done; 172 + !mutated 173 + in 174 + 175 + if not (test_html test_input) then 176 + exit 1
+58
fuzz/test_crash.ml
···
··· 1 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 2 + let serialize result = Html5rw.to_string ~pretty:false result 3 + 4 + let rec dump_node indent (node : Html5rw.Dom.node) = 5 + let padding = String.make (indent * 2) ' ' in 6 + Printf.printf "%s[%s]" padding (String.escaped node.name); 7 + (match node.namespace with Some ns -> Printf.printf " ns=%s" ns | None -> ()); 8 + if node.attrs <> [] then begin 9 + Printf.printf " attrs=%d" (List.length node.attrs) 10 + end; 11 + Printf.printf "\n"; 12 + List.iter (dump_node (indent + 1)) node.children 13 + 14 + let () = 15 + let input = In_channel.input_all (In_channel.open_bin Sys.argv.(1)) in 16 + let r1 = Html5rw.parse (reader_of_string input) in 17 + let s1 = serialize r1 in 18 + Printf.printf "=== After 1st parse ===\n"; 19 + dump_node 0 (Html5rw.root r1); 20 + Printf.printf "\ns1 (%d): %s\n\n" (String.length s1) (String.escaped (String.sub s1 0 (min 200 (String.length s1)))); 21 + 22 + let r2 = Html5rw.parse (reader_of_string s1) in 23 + let s2 = serialize r2 in 24 + Printf.printf "=== After 2nd parse ===\n"; 25 + dump_node 0 (Html5rw.root r2); 26 + Printf.printf "\ns2 (%d): %s\n\n" (String.length s2) (String.escaped (String.sub s2 0 (min 200 (String.length s2)))); 27 + 28 + let r3 = Html5rw.parse (reader_of_string s2) in 29 + let s3 = serialize r3 in 30 + Printf.printf "=== After 3rd parse ===\n"; 31 + dump_node 0 (Html5rw.root r3); 32 + Printf.printf "\ns3 (%d): %s\n\n" (String.length s3) (String.escaped (String.sub s3 0 (min 200 (String.length s3)))); 33 + 34 + Printf.printf "=== FULL s2 ===\n%s\n\n" (String.escaped s2); 35 + Printf.printf "=== FULL s3 ===\n%s\n\n" (String.escaped s3); 36 + 37 + if s2 = s3 then 38 + Printf.printf "ROUNDTRIP OK\n" 39 + else begin 40 + Printf.printf "ROUNDTRIP FAILED - finding first difference:\n"; 41 + let min_len = min (String.length s2) (String.length s3) in 42 + let rec find_diff i = 43 + if i >= min_len then 44 + Printf.printf "Strings differ in length: s2=%d, s3=%d\n" (String.length s2) (String.length s3) 45 + else if s2.[i] <> s3.[i] then begin 46 + Printf.printf "First diff at position %d: s2[%d]='%s' vs s3[%d]='%s'\n" 47 + i i (String.escaped (String.make 1 s2.[i])) i (String.escaped (String.make 1 s3.[i])); 48 + Printf.printf "Context around diff:\n"; 49 + let start = max 0 (i - 30) in 50 + let end_s2 = min (String.length s2) (i + 50) in 51 + let end_s3 = min (String.length s3) (i + 50) in 52 + Printf.printf "s2[%d..%d]: %s\n" start end_s2 (String.escaped (String.sub s2 start (end_s2 - start))); 53 + Printf.printf "s3[%d..%d]: %s\n" start end_s3 (String.escaped (String.sub s3 start (end_s3 - start))); 54 + end 55 + else find_diff (i + 1) 56 + in 57 + find_diff 0 58 + end
+17
fuzz/test_pre.ml
···
··· 1 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 2 + 3 + let rec dump_node indent (node : Html5rw.Dom.node) = 4 + let padding = String.make (indent * 2) ' ' in 5 + Printf.printf "%s%s" padding node.name; 6 + (match node.data with 7 + | "" -> () 8 + | d -> Printf.printf " data=%S" d); 9 + Printf.printf "\n"; 10 + List.iter (dump_node (indent + 1)) node.children 11 + 12 + let () = 13 + let input = "<pre><div>\n</div></pre>" in 14 + Printf.printf "Input: %s\n" (String.escaped input); 15 + let r1 = Html5rw.parse (reader_of_string input) in 16 + Printf.printf "DOM:\n"; 17 + dump_node 0 (Html5rw.root r1)
+35
html5rw-js.opam
···
··· 1 + # This file is generated by dune, edit dune-project instead 2 + opam-version: "2.0" 3 + synopsis: "Browser-based HTML5 parser via js_of_ocaml/wasm_of_ocaml" 4 + description: 5 + "JavaScript and WebAssembly builds of the html5rw HTML5 parser for browser use. Includes a main validator library, web worker for background validation, and browser-based test runner." 6 + maintainer: ["Anil Madhavapeddy <anil@recoil.org>"] 7 + authors: ["Anil Madhavapeddy <anil@recoil.org>"] 8 + license: "MIT" 9 + homepage: "https://tangled.org/@anil.recoil.org/ocaml-html5rw" 10 + bug-reports: "https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues" 11 + depends: [ 12 + "dune" {>= "3.20"} 13 + "ocaml" {>= "5.1.0"} 14 + "html5rw" {= version} 15 + "js_of_ocaml" {>= "5.0"} 16 + "js_of_ocaml-ppx" {>= "5.0"} 17 + "wasm_of_ocaml-compiler" {>= "5.0"} 18 + "brr" {>= "0.0.6"} 19 + "odoc" {with-doc} 20 + ] 21 + build: [ 22 + ["dune" "subst"] {dev} 23 + [ 24 + "dune" 25 + "build" 26 + "-p" 27 + name 28 + "-j" 29 + jobs 30 + "@install" 31 + "@runtest" {with-test} 32 + "@doc" {with-doc} 33 + ] 34 + ] 35 + x-maintenance-intent: ["(latest)"]
+3 -4
html5rw.opam
··· 6 maintainer: ["Anil Madhavapeddy <anil@recoil.org>"] 7 authors: ["Anil Madhavapeddy <anil@recoil.org>"] 8 license: "MIT" 9 - homepage: "https://tangled.org/anil.recoil.org/ocaml-html5rw" 10 - bug-reports: "https://tangled.org/anil.recoil.org/ocaml-html5rw/issues" 11 depends: [ 12 - "dune" {>= "3.21"} 13 "ocaml" {>= "5.1.0"} 14 "astring" {>= "0.8.5"} 15 "bytesrw" {>= "0.3.0"} ··· 36 "@doc" {with-doc} 37 ] 38 ] 39 - dev-repo: "git+https://tangled.org/anil.recoil.org/ocaml-html5rw" 40 x-maintenance-intent: ["(latest)"]
··· 6 maintainer: ["Anil Madhavapeddy <anil@recoil.org>"] 7 authors: ["Anil Madhavapeddy <anil@recoil.org>"] 8 license: "MIT" 9 + homepage: "https://tangled.org/@anil.recoil.org/ocaml-html5rw" 10 + bug-reports: "https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues" 11 depends: [ 12 + "dune" {>= "3.20"} 13 "ocaml" {>= "5.1.0"} 14 "astring" {>= "0.8.5"} 15 "bytesrw" {>= "0.3.0"} ··· 36 "@doc" {with-doc} 37 ] 38 ] 39 x-maintenance-intent: ["(latest)"]
+7 -2
lib/html5rw/dom/dom.ml
··· 12 13 include Dom_node 14 15 - let to_html = Dom_serialize.to_html 16 - let to_writer = Dom_serialize.to_writer 17 let to_test_format = Dom_serialize.to_test_format 18 let to_text = Dom_serialize.to_text
··· 12 13 include Dom_node 14 15 + (* Wrap serialization functions to hide internal text_mode parameter *) 16 + let to_html ?pretty ?indent_size ?indent node = 17 + Dom_serialize.to_html ?pretty ?indent_size ?indent node 18 + 19 + let to_writer ?pretty ?indent_size ?indent w node = 20 + Dom_serialize.to_writer ?pretty ?indent_size ?indent w node 21 + 22 let to_test_format = Dom_serialize.to_test_format 23 let to_text = Dom_serialize.to_text
+571 -84
lib/html5rw/dom/dom_serialize.ml
··· 20 21 let is_void name = Hashtbl.mem void_elements_tbl name 22 23 (* Foreign attribute adjustments for test output *) 24 let foreign_attr_adjustments = [ 25 "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role"; ··· 39 ) text; 40 Buffer.contents buf 41 42 (* Choose quote character for attribute value *) 43 let choose_attr_quote value = 44 if String.contains value '"' && not (String.contains value '\'') then '\'' 45 else '"' 46 47 - (* Escape attribute value *) 48 let escape_attr_value value quote_char = 49 let buf = Buffer.create (String.length value) in 50 String.iter (fun c -> 51 match c with 52 | '&' -> Buffer.add_string buf "&amp;" 53 | '"' when quote_char = '"' -> Buffer.add_string buf "&quot;" 54 | '\'' when quote_char = '\'' -> Buffer.add_string buf "&#39;" 55 | c -> Buffer.add_char buf c ··· 68 ) value; 69 not !invalid 70 71 - (* Serialize start tag - per WHATWG spec, attribute values must be quoted *) 72 let serialize_start_tag name attrs = 73 let buf = Buffer.create 64 in 74 Buffer.add_char buf '<'; 75 Buffer.add_string buf name; 76 List.iter (fun (key, value) -> 77 - Buffer.add_char buf ' '; 78 - Buffer.add_string buf key; 79 - if value <> "" then begin 80 - (* WHATWG serialization algorithm requires double quotes around values *) 81 - Buffer.add_char buf '='; 82 - Buffer.add_char buf '"'; 83 - Buffer.add_string buf (escape_attr_value value '"'); 84 - Buffer.add_char buf '"' 85 end 86 ) attrs; 87 Buffer.add_char buf '>'; ··· 91 let serialize_end_tag name = 92 "</" ^ name ^ ">" 93 94 - (* Convert node to HTML string *) 95 - let rec to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) node = 96 let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in 97 let newline = if pretty then "\n" else "" in 98 99 match node.name with 100 | "#document" -> 101 - let parts = List.map (to_html ~pretty ~indent_size ~indent:0) node.children in 102 - String.concat newline (List.filter (fun s -> s <> "") parts) 103 104 | "#document-fragment" -> 105 - let parts = List.map (to_html ~pretty ~indent_size ~indent) node.children in 106 - String.concat newline (List.filter (fun s -> s <> "") parts) 107 108 | "#text" -> 109 let text = node.data in 110 - if pretty then 111 let trimmed = String.trim text in 112 - if trimmed = "" then "" 113 - else prefix ^ escape_text trimmed 114 - else escape_text text 115 116 | "#comment" -> 117 - prefix ^ "<!--" ^ node.data ^ "-->" 118 119 | "!doctype" -> 120 - prefix ^ "<!DOCTYPE html>" 121 122 | name -> 123 - let open_tag = serialize_start_tag name node.attrs in 124 125 - if is_void name then 126 - prefix ^ open_tag 127 - else if node.children = [] then 128 - prefix ^ open_tag ^ serialize_end_tag name 129 else begin 130 - (* Check if all children are text *) 131 - let all_text = List.for_all is_text node.children in 132 - if all_text && pretty then 133 - let text = String.concat "" (List.map (fun c -> c.data) node.children) in 134 - prefix ^ open_tag ^ escape_text text ^ serialize_end_tag name 135 - else begin 136 - let parts = [prefix ^ open_tag] in 137 - let child_parts = List.filter_map (fun child -> 138 - let html = to_html ~pretty ~indent_size ~indent:(indent + 1) child in 139 - if html = "" then None else Some html 140 - ) node.children in 141 - let parts = parts @ child_parts @ [prefix ^ serialize_end_tag name] in 142 - String.concat newline parts 143 - end 144 end 145 146 (* Get qualified name for test format *) 147 let qualified_name node = ··· 226 if strip then String.trim combined else combined 227 228 (* Streaming serialization to a Bytes.Writer.t 229 - Writes HTML directly to the writer without building intermediate strings *) 230 - let rec to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node = 231 let write s = Bytes.Writer.write_string w s in 232 let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in 233 let write_newline () = if pretty then write "\n" in 234 235 match node.name with 236 | "#document" -> 237 - let rec write_children first = function 238 - | [] -> () 239 - | child :: rest -> 240 - if not first && pretty then write_newline (); 241 - to_writer ~pretty ~indent_size ~indent:0 w child; 242 - write_children false rest 243 - in 244 - write_children true node.children 245 246 | "#document-fragment" -> 247 - let rec write_children first = function 248 - | [] -> () 249 - | child :: rest -> 250 - if not first && pretty then write_newline (); 251 - to_writer ~pretty ~indent_size ~indent w child; 252 - write_children false rest 253 - in 254 - write_children true node.children 255 256 | "#text" -> 257 let text = node.data in 258 - if pretty then begin 259 let trimmed = String.trim text in 260 if trimmed <> "" then begin 261 write_prefix (); 262 - write (escape_text trimmed) 263 end 264 end else 265 - write (escape_text text) 266 267 | "#comment" -> 268 write_prefix (); 269 write "<!--"; 270 write node.data; 271 - write "-->" 272 273 | "!doctype" -> 274 write_prefix (); 275 - write "<!DOCTYPE html>" 276 277 | name -> 278 write_prefix (); 279 write (serialize_start_tag name node.attrs); 280 281 - if not (is_void name) then begin 282 - if node.children = [] then 283 - write (serialize_end_tag name) 284 - else begin 285 - (* Check if all children are text *) 286 - let all_text = List.for_all is_text node.children in 287 - if all_text && pretty then begin 288 - let text = String.concat "" (List.map (fun c -> c.data) node.children) in 289 - write (escape_text text); 290 - write (serialize_end_tag name) 291 - end else begin 292 - let rec write_children = function 293 - | [] -> () 294 - | child :: rest -> 295 - write_newline (); 296 - to_writer ~pretty ~indent_size ~indent:(indent + 1) w child; 297 - write_children rest 298 - in 299 - write_children node.children; 300 write_newline (); 301 write_prefix (); 302 write (serialize_end_tag name) 303 - end 304 end 305 end
··· 20 21 let is_void name = Hashtbl.mem void_elements_tbl name 22 23 + (* Raw text elements - content should NOT be escaped at all 24 + Per WHATWG spec: script, style, xmp, iframe, noembed, noframes 25 + Note: noscript depends on scripting being enabled (we assume it is) 26 + Note: plaintext is handled specially - it has no closing tag *) 27 + let raw_text_elements_tbl = 28 + let elements = [ 29 + "script"; "style"; "xmp"; "iframe"; "noembed"; "noframes"; "noscript" 30 + ] in 31 + let tbl = Hashtbl.create (List.length elements) in 32 + List.iter (fun e -> Hashtbl.add tbl e ()) elements; 33 + tbl 34 + 35 + let is_raw_text_element name = Hashtbl.mem raw_text_elements_tbl name 36 + 37 + (* plaintext is special: it can never be closed, everything after is raw text. 38 + We treat it as raw text but without a closing tag. *) 39 + let is_plaintext_element name = name = "plaintext" 40 + 41 + (* Escapable raw text elements - only & needs to be escaped *) 42 + let escapable_raw_text_elements_tbl = 43 + let elements = ["textarea"; "title"] in 44 + let tbl = Hashtbl.create (List.length elements) in 45 + List.iter (fun e -> Hashtbl.add tbl e ()) elements; 46 + tbl 47 + 48 + let is_escapable_raw_text_element name = Hashtbl.mem escapable_raw_text_elements_tbl name 49 + 50 + (* HTML breakout elements - these break out of foreign content (SVG/MathML) when parsed. 51 + Per WHATWG spec section 13.2.6.5, these start tags cause exit from foreign content. *) 52 + let html_breakout_elements_tbl = 53 + let elements = [ 54 + "b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt"; 55 + "em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li"; 56 + "listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span"; 57 + "strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var" 58 + ] in 59 + let tbl = Hashtbl.create (List.length elements) in 60 + List.iter (fun e -> Hashtbl.add tbl e ()) elements; 61 + tbl 62 + 63 + let is_html_breakout_element name = Hashtbl.mem html_breakout_elements_tbl (String.lowercase_ascii name) 64 + 65 + (* HTML integration points in SVG - these allow HTML content inside SVG *) 66 + let is_svg_html_integration_point name = 67 + let name = String.lowercase_ascii name in 68 + name = "foreignobject" || name = "desc" || name = "title" 69 + 70 + (* Formatting elements - these are in the list of active formatting elements 71 + and the adoption agency algorithm handles them specially when block elements appear *) 72 + let formatting_elements_tbl = 73 + let elements = ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] in 74 + let tbl = Hashtbl.create (List.length elements) in 75 + List.iter (fun e -> Hashtbl.add tbl e ()) elements; 76 + tbl 77 + 78 + let is_formatting_element name = Hashtbl.mem formatting_elements_tbl (String.lowercase_ascii name) 79 + 80 + (* Block elements that trigger adoption agency when inside formatting elements *) 81 + let is_block_element name = 82 + let name = String.lowercase_ascii name in 83 + List.mem name ["div"; "p"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "blockquote"; "pre"; "ol"; "ul"; "dl"; 84 + "table"; "form"; "fieldset"; "address"; "article"; "aside"; "footer"; "header"; "main"; 85 + "nav"; "section"; "figure"; "figcaption"; "details"; "summary"] 86 + 87 + (* Elements where a leading newline in content must be doubled during serialization. 88 + Per HTML5 spec, the parser strips a single leading newline after opening tags 89 + for pre, textarea, and listing elements. To preserve content, we must emit 90 + an extra newline if the content starts with one. *) 91 + let needs_leading_newline_preserved name = 92 + name = "pre" || name = "textarea" || name = "listing" 93 + 94 + (* Check if text content starts with a newline (LF) *) 95 + let starts_with_newline text = 96 + String.length text > 0 && text.[0] = '\n' 97 + 98 + (* Get the first text content from children, if any *) 99 + let first_text_content children = 100 + match children with 101 + | [] -> "" 102 + | first :: _ when first.name = "#text" -> first.data 103 + | _ -> "" 104 + 105 (* Foreign attribute adjustments for test output *) 106 let foreign_attr_adjustments = [ 107 "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role"; ··· 121 ) text; 122 Buffer.contents buf 123 124 + (* Escape text for escapable raw text elements (only & needs escaping) *) 125 + let escape_escapable_raw_text text = 126 + let buf = Buffer.create (String.length text) in 127 + String.iter (fun c -> 128 + match c with 129 + | '&' -> Buffer.add_string buf "&amp;" 130 + | c -> Buffer.add_char buf c 131 + ) text; 132 + Buffer.contents buf 133 + 134 (* Choose quote character for attribute value *) 135 let choose_attr_quote value = 136 if String.contains value '"' && not (String.contains value '\'') then '\'' 137 else '"' 138 139 + (* Escape attribute value - must escape &, quotes, and < for safe reparsing *) 140 let escape_attr_value value quote_char = 141 let buf = Buffer.create (String.length value) in 142 String.iter (fun c -> 143 match c with 144 | '&' -> Buffer.add_string buf "&amp;" 145 + | '<' -> Buffer.add_string buf "&lt;" 146 | '"' when quote_char = '"' -> Buffer.add_string buf "&quot;" 147 | '\'' when quote_char = '\'' -> Buffer.add_string buf "&#39;" 148 | c -> Buffer.add_char buf c ··· 161 ) value; 162 not !invalid 163 164 + (* Check if a name is valid for serialization - rejects control chars, 165 + whitespace, and special chars like quotes, angle brackets, slash, equals *) 166 + let is_valid_name ?(allow_lt=false) name = 167 + if String.length name = 0 then false 168 + else 169 + let valid = ref true in 170 + String.iter (fun c -> 171 + let code = Char.code c in 172 + if code <= 0x1F || (code >= 0x7F && code <= 0x9F) || 173 + c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' || 174 + c = '"' || c = '\'' || c = '>' || c = '/' || c = '=' || 175 + (c = '<' && not allow_lt) then 176 + valid := false 177 + ) name; 178 + !valid 179 + 180 + let is_valid_attr_name = is_valid_name ~allow_lt:false 181 + 182 + (* Element names must be ASCII-only for consistent roundtrip parsing *) 183 + let is_valid_element_name name = 184 + if String.length name = 0 then false 185 + else 186 + let valid = ref true in 187 + String.iter (fun c -> 188 + let code = Char.code c in 189 + (* Reject all non-ASCII and special chars *) 190 + if code < 0x21 || code > 0x7E || 191 + c = '"' || c = '\'' || c = '>' || c = '/' || c = '=' || c = '<' then 192 + valid := false 193 + ) name; 194 + !valid 195 + 196 + (* Sanitize element name by removing invalid characters. 197 + Returns a safe element name for serialization. 198 + Only keeps printable ASCII chars excluding special HTML chars. *) 199 + let sanitize_element_name name = 200 + if is_valid_element_name name then name 201 + else begin 202 + let buf = Buffer.create (String.length name) in 203 + String.iter (fun c -> 204 + let code = Char.code c in 205 + (* Keep only printable ASCII excluding special chars *) 206 + if code >= 0x21 && code <= 0x7E && 207 + c <> '"' && c <> '\'' && c <> '>' && c <> '/' && c <> '=' && c <> '<' then 208 + Buffer.add_char buf c 209 + ) name; 210 + let sanitized = Buffer.contents buf in 211 + if String.length sanitized = 0 then "span" else sanitized 212 + end 213 + 214 + (* Serialize start tag - per WHATWG spec, attribute values must be quoted. 215 + Attributes with invalid names are skipped to ensure valid HTML output. *) 216 let serialize_start_tag name attrs = 217 let buf = Buffer.create 64 in 218 Buffer.add_char buf '<'; 219 Buffer.add_string buf name; 220 List.iter (fun (key, value) -> 221 + (* Skip attributes with invalid names - they can't be serialized safely *) 222 + if is_valid_attr_name key then begin 223 + Buffer.add_char buf ' '; 224 + Buffer.add_string buf key; 225 + if value <> "" then begin 226 + (* WHATWG serialization algorithm requires double quotes around values *) 227 + Buffer.add_char buf '='; 228 + Buffer.add_char buf '"'; 229 + Buffer.add_string buf (escape_attr_value value '"'); 230 + Buffer.add_char buf '"' 231 + end 232 end 233 ) attrs; 234 Buffer.add_char buf '>'; ··· 238 let serialize_end_tag name = 239 "</" ^ name ^ ">" 240 241 + (* Text escaping mode based on parent element *) 242 + type text_mode = Normal | Raw | EscapableRaw 243 + 244 + (* Foreign content context for tracking SVG/MathML during serialization *) 245 + type foreign_ctx = NotForeign | InSvg | InMathML 246 + 247 + (* Serialization context for tracking state during tree traversal *) 248 + type serial_ctx = { 249 + mutable open_formatting: string list; (* Stack of open formatting element names *) 250 + mutable in_foreign: foreign_ctx; (* Current foreign content context *) 251 + mutable foreign_depth: int; (* Depth inside foreign content *) 252 + } 253 + 254 + let create_ctx () = { 255 + open_formatting = []; 256 + in_foreign = NotForeign; 257 + foreign_depth = 0; 258 + } 259 + 260 + (* Check if a formatting element is already open in the context *) 261 + let has_open_formatting ctx name = 262 + List.mem (String.lowercase_ascii name) (List.map String.lowercase_ascii ctx.open_formatting) 263 + 264 + (* Table elements that need implicit wrappers *) 265 + let table_cell_elements = ["td"; "th"] 266 + let table_row_elements = ["tr"] 267 + let table_section_elements = ["tbody"; "thead"; "tfoot"] 268 + 269 + (* Check if we need to add implicit table wrappers *) 270 + let needs_tbody_wrapper parent_name children = 271 + String.lowercase_ascii parent_name = "table" && 272 + List.exists (fun c -> 273 + let n = String.lowercase_ascii c.name in 274 + List.mem n table_row_elements || List.mem n table_cell_elements 275 + ) children 276 + 277 + (* Check if a table has any real table content (not just comments/text that would be foster-parented) *) 278 + let table_has_real_content children = 279 + List.exists (fun c -> 280 + let n = String.lowercase_ascii c.name in 281 + List.mem n table_section_elements || 282 + List.mem n table_row_elements || 283 + List.mem n table_cell_elements || 284 + n = "caption" || n = "colgroup" || n = "col" 285 + ) children 286 + 287 + (* Check if this is an empty table that would cause foster parenting instability *) 288 + let is_empty_table name children = 289 + String.lowercase_ascii name = "table" && not (table_has_real_content children) 290 + 291 + (* Structural elements that have special parsing behavior and cause instability 292 + when nested inside other elements. These should have their content output 293 + directly without the wrapper element when found in unexpected contexts. *) 294 + let is_structural_element name = 295 + let name = String.lowercase_ascii name in 296 + name = "body" || name = "head" || name = "html" 297 + 298 + (* Convert node to HTML string 299 + Returns (html_string, encountered_plaintext) where encountered_plaintext 300 + indicates that a plaintext element was found and no more content should 301 + be serialized after this point (plaintext absorbs everything after it) 302 + 303 + The in_foreign parameter tracks whether we're inside SVG or MathML foreign 304 + content. When in foreign content, HTML breakout elements need special handling 305 + to ensure roundtrip stability. 306 + 307 + The ctx parameter tracks serialization state for adoption agency handling. *) 308 + let rec to_html_internal ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) ?(in_foreign=NotForeign) ?(ctx=None) node = 309 + let ctx = match ctx with Some c -> c | None -> create_ctx () in 310 let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in 311 let newline = if pretty then "\n" else "" in 312 313 + (* Escape text based on mode *) 314 + let escape_for_mode text = match text_mode with 315 + | Normal -> escape_text text 316 + | Raw -> text (* No escaping for script/style content *) 317 + | EscapableRaw -> escape_escapable_raw_text text 318 + in 319 + 320 match node.name with 321 | "#document" -> 322 + let buf = Buffer.create 256 in 323 + let first = ref true in 324 + let plaintext_found = ref false in 325 + List.iter (fun child -> 326 + if not !plaintext_found then begin 327 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:0 ~text_mode:Normal ~in_foreign:NotForeign ~ctx:(Some ctx) child in 328 + if html <> "" then begin 329 + if not !first && pretty then Buffer.add_string buf newline; 330 + Buffer.add_string buf html; 331 + first := false 332 + end; 333 + if pt then plaintext_found := true 334 + end 335 + ) node.children; 336 + (Buffer.contents buf, !plaintext_found) 337 338 | "#document-fragment" -> 339 + let buf = Buffer.create 256 in 340 + let first = ref true in 341 + let plaintext_found = ref false in 342 + List.iter (fun child -> 343 + if not !plaintext_found then begin 344 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode ~in_foreign ~ctx:(Some ctx) child in 345 + if html <> "" then begin 346 + if not !first && pretty then Buffer.add_string buf newline; 347 + Buffer.add_string buf html; 348 + first := false 349 + end; 350 + if pt then plaintext_found := true 351 + end 352 + ) node.children; 353 + (Buffer.contents buf, !plaintext_found) 354 355 | "#text" -> 356 let text = node.data in 357 + if pretty && text_mode = Normal then 358 let trimmed = String.trim text in 359 + if trimmed = "" then ("", false) 360 + else (prefix ^ escape_for_mode trimmed, false) 361 + else (escape_for_mode text, false) 362 363 | "#comment" -> 364 + (prefix ^ "<!--" ^ node.data ^ "-->", false) 365 366 | "!doctype" -> 367 + (prefix ^ "<!DOCTYPE html>", false) 368 369 | name -> 370 + (* Sanitize element name to ensure valid HTML output *) 371 + let name = sanitize_element_name name in 372 + let name_lower = String.lowercase_ascii name in 373 + 374 + (* Determine the foreign context for this element and its children. 375 + If we enter SVG or MathML, track that. If we're at an HTML integration 376 + point inside SVG, children are processed in HTML mode. *) 377 + let this_foreign = match node.namespace with 378 + | Some "svg" -> InSvg 379 + | Some "mathml" -> InMathML 380 + | _ -> in_foreign 381 + in 382 + 383 + (* Update foreign depth tracking *) 384 + let entering_foreign = this_foreign <> NotForeign && in_foreign = NotForeign in 385 + if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth + 1; 386 + 387 + (* For children: if we're at an SVG HTML integration point, children go back to HTML mode *) 388 + let child_foreign = 389 + if this_foreign = InSvg && is_svg_html_integration_point name then NotForeign 390 + else this_foreign 391 + in 392 + 393 + (* When in foreign content, HTML breakout elements would cause the parser 394 + to exit foreign content on reparse. To ensure roundtrip stability, 395 + prefix them with 'x-' to make them custom elements. *) 396 + let name = 397 + if in_foreign <> NotForeign && is_html_breakout_element name then 398 + "x-" ^ name 399 + else 400 + name 401 + in 402 + 403 + (* Handle nested formatting elements for adoption agency stability. 404 + If we're about to serialize a formatting element that's already open, 405 + we need to close the outer one first and reopen it after children. 406 + This matches how the parser would reconstruct the elements. *) 407 + let is_fmt = is_formatting_element name_lower in 408 + let nested_fmt = is_fmt && has_open_formatting ctx name_lower in 409 410 + (* For nested formatting elements, don't output the inner tag at all - 411 + instead, close the outer and let it reopen naturally. This produces 412 + flatter HTML that the parser will handle consistently. *) 413 + if nested_fmt then begin 414 + (* Just serialize children without this element wrapper *) 415 + let buf = Buffer.create 256 in 416 + let plaintext_found = ref false in 417 + let child_text_mode = 418 + if is_raw_text_element name then Raw 419 + else if is_escapable_raw_text_element name then EscapableRaw 420 + else Normal 421 + in 422 + List.iter (fun child -> 423 + if not !plaintext_found then begin 424 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode:child_text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in 425 + if html <> "" then begin 426 + if Buffer.length buf > 0 && pretty then Buffer.add_string buf newline; 427 + Buffer.add_string buf html 428 + end; 429 + if pt then plaintext_found := true 430 + end 431 + ) node.children; 432 + if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1; 433 + (Buffer.contents buf, !plaintext_found) 434 + end 435 + (* Empty tables cause foster-parenting instability - skip the table tag 436 + and output children (comments/text) directly, since they would be 437 + foster-parented out of the table during reparsing anyway. *) 438 + else if is_empty_table name node.children then begin 439 + let buf = Buffer.create 256 in 440 + let plaintext_found = ref false in 441 + List.iter (fun child -> 442 + if not !plaintext_found then begin 443 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in 444 + if html <> "" then begin 445 + if Buffer.length buf > 0 && pretty then Buffer.add_string buf newline; 446 + Buffer.add_string buf html 447 + end; 448 + if pt then plaintext_found := true 449 + end 450 + ) node.children; 451 + if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1; 452 + (Buffer.contents buf, !plaintext_found) 453 + end 454 + (* Structural elements (body, head, html) nested inside other elements 455 + cause parsing instability. Skip the wrapper and output children directly. *) 456 + else if is_structural_element name && indent > 0 then begin 457 + let buf = Buffer.create 256 in 458 + let plaintext_found = ref false in 459 + List.iter (fun child -> 460 + if not !plaintext_found then begin 461 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in 462 + if html <> "" then begin 463 + if Buffer.length buf > 0 && pretty then Buffer.add_string buf newline; 464 + Buffer.add_string buf html 465 + end; 466 + if pt then plaintext_found := true 467 + end 468 + ) node.children; 469 + if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1; 470 + (Buffer.contents buf, !plaintext_found) 471 + end 472 else begin 473 + (* Track this formatting element if applicable *) 474 + if is_fmt then ctx.open_formatting <- name_lower :: ctx.open_formatting; 475 + 476 + let open_tag = serialize_start_tag name node.attrs in 477 + 478 + let result = 479 + if is_void name then 480 + (prefix ^ open_tag, false) 481 + else if is_plaintext_element name then begin 482 + (* plaintext is special: it cannot be closed once opened. 483 + We serialize content as raw text without a closing tag. 484 + Also signal that plaintext was encountered so ancestors 485 + don't add closing tags. *) 486 + let text = String.concat "" (List.map (fun c -> c.data) node.children) in 487 + (prefix ^ open_tag ^ text, true) 488 + end else if node.children = [] then 489 + (prefix ^ open_tag ^ serialize_end_tag name, false) 490 + else begin 491 + (* Determine text mode for children based on this element *) 492 + let child_text_mode = 493 + if is_raw_text_element name then Raw 494 + else if is_escapable_raw_text_element name then EscapableRaw 495 + else Normal 496 + in 497 + (* Check if all children are text *) 498 + let all_text = List.for_all is_text node.children in 499 + (* Per HTML5 spec, pre/textarea/listing need leading newline doubled *) 500 + let leading_newline = 501 + if needs_leading_newline_preserved name && 502 + starts_with_newline (first_text_content node.children) 503 + then "\n" else "" 504 + in 505 + 506 + (* Add implicit tbody wrapper for tables with direct tr/td children. 507 + This prevents foster parenting on reparse. *) 508 + let children, needs_tbody = 509 + if needs_tbody_wrapper name node.children then begin 510 + (* Wrap row/cell children in tbody *) 511 + let (before, rows_and_after) = List.partition (fun c -> 512 + let n = String.lowercase_ascii c.name in 513 + n = "caption" || n = "colgroup" || n = "col" 514 + ) node.children in 515 + if rows_and_after <> [] then 516 + let tbody_node = { 517 + name = "tbody"; 518 + namespace = None; 519 + data = ""; 520 + attrs = []; 521 + children = rows_and_after; 522 + parent = None; 523 + doctype = None; 524 + template_content = None; 525 + location = None; 526 + } in 527 + (before @ [tbody_node], true) 528 + else 529 + (node.children, false) 530 + end else 531 + (node.children, false) 532 + in 533 + let _ = needs_tbody in (* suppress warning *) 534 + 535 + if all_text && not needs_tbody then begin 536 + let text = String.concat "" (List.map (fun c -> c.data) node.children) in 537 + let escaped = match child_text_mode with 538 + | Normal -> escape_text text 539 + | Raw -> text 540 + | EscapableRaw -> escape_escapable_raw_text text 541 + in 542 + (prefix ^ open_tag ^ leading_newline ^ escaped ^ serialize_end_tag name, false) 543 + end else begin 544 + let buf = Buffer.create 256 in 545 + Buffer.add_string buf (prefix ^ open_tag); 546 + Buffer.add_string buf leading_newline; 547 + let plaintext_found = ref false in 548 + List.iter (fun child -> 549 + if not !plaintext_found then begin 550 + let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:(indent + 1) ~text_mode:child_text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in 551 + if html <> "" then begin 552 + Buffer.add_string buf newline; 553 + Buffer.add_string buf html 554 + end; 555 + if pt then plaintext_found := true 556 + end 557 + ) children; 558 + (* Only add closing tag if plaintext wasn't found *) 559 + if not !plaintext_found then begin 560 + Buffer.add_string buf newline; 561 + Buffer.add_string buf (prefix ^ serialize_end_tag name) 562 + end; 563 + (Buffer.contents buf, !plaintext_found) 564 + end 565 + end 566 + in 567 + 568 + (* Pop formatting element from stack *) 569 + if is_fmt then 570 + ctx.open_formatting <- (match ctx.open_formatting with _ :: rest -> rest | [] -> []); 571 + 572 + if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1; 573 + result 574 end 575 + 576 + (* Public wrapper that discards the plaintext flag *) 577 + let to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) node = 578 + fst (to_html_internal ~pretty ~indent_size ~indent ~text_mode node) 579 580 (* Get qualified name for test format *) 581 let qualified_name node = ··· 660 if strip then String.trim combined else combined 661 662 (* Streaming serialization to a Bytes.Writer.t 663 + Writes HTML directly to the writer without building intermediate strings 664 + Returns true if a plaintext element was encountered (stops further serialization) *) 665 + let rec to_writer_internal ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) (w : Bytes.Writer.t) node = 666 let write s = Bytes.Writer.write_string w s in 667 let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in 668 let write_newline () = if pretty then write "\n" in 669 670 + (* Escape text based on mode *) 671 + let escape_for_mode text = match text_mode with 672 + | Normal -> escape_text text 673 + | Raw -> text 674 + | EscapableRaw -> escape_escapable_raw_text text 675 + in 676 + 677 match node.name with 678 | "#document" -> 679 + let plaintext_found = ref false in 680 + let first = ref true in 681 + List.iter (fun child -> 682 + if not !plaintext_found then begin 683 + if not !first && pretty then write_newline (); 684 + let pt = to_writer_internal ~pretty ~indent_size ~indent:0 ~text_mode:Normal w child in 685 + first := false; 686 + if pt then plaintext_found := true 687 + end 688 + ) node.children; 689 + !plaintext_found 690 691 | "#document-fragment" -> 692 + let plaintext_found = ref false in 693 + let first = ref true in 694 + List.iter (fun child -> 695 + if not !plaintext_found then begin 696 + if not !first && pretty then write_newline (); 697 + let pt = to_writer_internal ~pretty ~indent_size ~indent ~text_mode w child in 698 + first := false; 699 + if pt then plaintext_found := true 700 + end 701 + ) node.children; 702 + !plaintext_found 703 704 | "#text" -> 705 let text = node.data in 706 + if pretty && text_mode = Normal then begin 707 let trimmed = String.trim text in 708 if trimmed <> "" then begin 709 write_prefix (); 710 + write (escape_for_mode trimmed) 711 end 712 end else 713 + write (escape_for_mode text); 714 + false 715 716 | "#comment" -> 717 write_prefix (); 718 write "<!--"; 719 write node.data; 720 + write "-->"; 721 + false 722 723 | "!doctype" -> 724 write_prefix (); 725 + write "<!DOCTYPE html>"; 726 + false 727 728 | name -> 729 + (* Sanitize element name to ensure valid HTML output *) 730 + let name = sanitize_element_name name in 731 write_prefix (); 732 write (serialize_start_tag name node.attrs); 733 734 + if is_void name then 735 + false (* No end tag for void elements *) 736 + else if is_plaintext_element name then begin 737 + (* plaintext is special: cannot be closed, content is raw *) 738 + let text = String.concat "" (List.map (fun c -> c.data) node.children) in 739 + write text; 740 + (* No closing tag for plaintext, signal to stop further serialization *) 741 + true 742 + end else if node.children = [] then begin 743 + write (serialize_end_tag name); 744 + false 745 + end else begin 746 + (* Determine text mode for children based on this element *) 747 + let child_text_mode = 748 + if is_raw_text_element name then Raw 749 + else if is_escapable_raw_text_element name then EscapableRaw 750 + else Normal 751 + in 752 + (* Check if all children are text *) 753 + let all_text = List.for_all is_text node.children in 754 + (* Per HTML5 spec, pre/textarea/listing need leading newline doubled *) 755 + let needs_leading_nl = 756 + needs_leading_newline_preserved name && 757 + starts_with_newline (first_text_content node.children) 758 + in 759 + if all_text then begin 760 + let text = String.concat "" (List.map (fun c -> c.data) node.children) in 761 + let escaped = match child_text_mode with 762 + | Normal -> escape_text text 763 + | Raw -> text 764 + | EscapableRaw -> escape_escapable_raw_text text 765 + in 766 + if needs_leading_nl then write "\n"; 767 + write escaped; 768 + write (serialize_end_tag name); 769 + false 770 + end else begin 771 + if needs_leading_nl then write "\n"; 772 + let plaintext_found = ref false in 773 + List.iter (fun child -> 774 + if not !plaintext_found then begin 775 + write_newline (); 776 + let pt = to_writer_internal ~pretty ~indent_size ~indent:(indent + 1) ~text_mode:child_text_mode w child in 777 + if pt then plaintext_found := true 778 + end 779 + ) node.children; 780 + (* Only add closing tag if plaintext wasn't found *) 781 + if not !plaintext_found then begin 782 write_newline (); 783 write_prefix (); 784 write (serialize_end_tag name) 785 + end; 786 + !plaintext_found 787 end 788 end 789 + 790 + (* Public wrapper that discards the plaintext flag *) 791 + let to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node = 792 + ignore (to_writer_internal ~pretty ~indent_size ~indent w node)
+3
lib/html5rw/parser/parser_tree_builder.ml
··· 208 end 209 210 let insert_element t name ?(namespace=None) ?(push=false) attrs = 211 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 212 let node = Dom.create_element name ~namespace ~attrs ~location () in 213 let (parent, before) = appropriate_insertion_place t in
··· 208 end 209 210 let insert_element t name ?(namespace=None) ?(push=false) attrs = 211 + (* Reset ignore_lf flag - per HTML5 spec, only the immediately next token 212 + after pre/textarea/listing should be checked for leading LF *) 213 + t.ignore_lf <- false; 214 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 215 let node = Dom.create_element name ~namespace ~attrs ~location () in 216 let (parent, before) = appropriate_insertion_place t in
+13 -1
lib/html5rw/tokenizer/tokenizer_impl.ml
··· 711 t.state <- Tokenizer_state.Bogus_comment 712 713 and state_tag_name () = 714 - match Tokenizer_stream.consume t.stream with 715 | Some ('\t' | '\n' | '\x0C' | ' ') -> 716 t.state <- Tokenizer_state.Before_attribute_name 717 | Some '/' -> 718 t.state <- Tokenizer_state.Self_closing_start_tag 719 | Some '>' -> 720 t.state <- Tokenizer_state.Data; 721 emit_current_tag () 722 | Some '\x00' -> 723 error t "unexpected-null-character"; 724 Buffer.add_string t.current_tag_name "\xEF\xBF\xBD" 725 | Some c -> 726 check_control_char c; 727 Buffer.add_char t.current_tag_name (ascii_lower c) 728 | None -> ()
··· 711 t.state <- Tokenizer_state.Bogus_comment 712 713 and state_tag_name () = 714 + match Tokenizer_stream.peek t.stream with 715 | Some ('\t' | '\n' | '\x0C' | ' ') -> 716 + Tokenizer_stream.advance t.stream; 717 t.state <- Tokenizer_state.Before_attribute_name 718 | Some '/' -> 719 + Tokenizer_stream.advance t.stream; 720 t.state <- Tokenizer_state.Self_closing_start_tag 721 | Some '>' -> 722 + Tokenizer_stream.advance t.stream; 723 t.state <- Tokenizer_state.Data; 724 emit_current_tag () 725 | Some '\x00' -> 726 + Tokenizer_stream.advance t.stream; 727 error t "unexpected-null-character"; 728 Buffer.add_string t.current_tag_name "\xEF\xBF\xBD" 729 + | Some '<' -> 730 + (* Per HTML5 spec section 13.2.5.8: '<' is "anything else" - append to tag name. 731 + Note: The previous implementation incorrectly emitted the tag and switched 732 + to tag open state. The spec says to just append the character to the tag name 733 + without emitting an error. *) 734 + Tokenizer_stream.advance t.stream; 735 + Buffer.add_char t.current_tag_name '<' 736 | Some c -> 737 + Tokenizer_stream.advance t.stream; 738 check_control_char c; 739 Buffer.add_char t.current_tag_name (ascii_lower c) 740 | None -> ()
+76 -78
lib/js/dune
··· 1 ; HTML5rw JavaScript Validator Library 2 ; Compiled with js_of_ocaml for browser use 3 - ; 4 - ; Build rules temporarily disabled - code kept in place 5 6 - ; (library 7 - ; (name htmlrw_js) 8 - ; (public_name html5rw.js) 9 - ; (libraries 10 - ; html5rw 11 - ; htmlrw_check 12 - ; bytesrw 13 - ; brr) 14 - ; (modes byte) ; js_of_ocaml requires bytecode 15 - ; (modules 16 - ; htmlrw_js_types 17 - ; htmlrw_js_dom 18 - ; htmlrw_js_annotate 19 - ; htmlrw_js_ui 20 - ; htmlrw_js)) 21 22 ; Standalone JavaScript file for direct browser use 23 ; This compiles the library entry point to a .js file 24 - ; (executable 25 - ; (name htmlrw_js_main) 26 - ; (libraries htmlrw_js) 27 - ; (js_of_ocaml 28 - ; (javascript_files)) 29 - ; (modes js wasm) 30 - ; (modules htmlrw_js_main)) 31 32 ; Web Worker for background validation 33 ; Runs validation in a separate thread to avoid blocking the UI 34 - ; (executable 35 - ; (name htmlrw_js_worker) 36 - ; (libraries html5rw htmlrw_check bytesrw brr) 37 - ; (js_of_ocaml 38 - ; (javascript_files)) 39 - ; (modes js wasm) 40 - ; (modules htmlrw_js_worker)) 41 42 ; Test runner for browser-based regression testing 43 ; Runs html5lib conformance tests in the browser 44 - ; (executable 45 - ; (name htmlrw_js_tests_main) 46 - ; (libraries html5rw bytesrw brr) 47 - ; (js_of_ocaml 48 - ; (javascript_files)) 49 - ; (modes js wasm) 50 - ; (modules htmlrw_js_tests htmlrw_js_tests_main)) 51 52 ; Copy to nice filenames (JS) 53 - ; (rule 54 - ; (targets htmlrw.js) 55 - ; (deps htmlrw_js_main.bc.js) 56 - ; (action (copy %{deps} %{targets}))) 57 58 - ; (rule 59 - ; (targets htmlrw-worker.js) 60 - ; (deps htmlrw_js_worker.bc.js) 61 - ; (action (copy %{deps} %{targets}))) 62 63 - ; (rule 64 - ; (targets htmlrw-tests.js) 65 - ; (deps htmlrw_js_tests_main.bc.js) 66 - ; (action (copy %{deps} %{targets}))) 67 68 ; Copy to nice filenames (WASM) 69 ; Note: requires wasm_of_ocaml-compiler to be installed 70 - ; (rule 71 - ; (targets htmlrw.wasm.js) 72 - ; (deps htmlrw_js_main.bc.wasm.js) 73 - ; (action (copy %{deps} %{targets}))) 74 75 - ; (rule 76 - ; (targets htmlrw-worker.wasm.js) 77 - ; (deps htmlrw_js_worker.bc.wasm.js) 78 - ; (action (copy %{deps} %{targets}))) 79 80 - ; (rule 81 - ; (targets htmlrw-tests.wasm.js) 82 - ; (deps htmlrw_js_tests_main.bc.wasm.js) 83 - ; (action (copy %{deps} %{targets}))) 84 85 ; Install web assets to share/html5rw-js/ for npm packaging 86 - ; (install 87 - ; (package html5rw-js) 88 - ; (section share) 89 - ; (files 90 - ; ; JavaScript bundles 91 - ; htmlrw.js 92 - ; htmlrw-worker.js 93 - ; htmlrw-tests.js 94 - ; ; WASM loader scripts 95 - ; htmlrw.wasm.js 96 - ; htmlrw-worker.wasm.js 97 - ; htmlrw-tests.wasm.js 98 - ; ; WASM assets (with content-hashed filenames) 99 - ; (glob_files_rec (htmlrw_js_main.bc.wasm.assets/* with_prefix htmlrw_js_main.bc.wasm.assets)) 100 - ; (glob_files_rec (htmlrw_js_worker.bc.wasm.assets/* with_prefix htmlrw_js_worker.bc.wasm.assets)) 101 - ; (glob_files_rec (htmlrw_js_tests_main.bc.wasm.assets/* with_prefix htmlrw_js_tests_main.bc.wasm.assets))))
··· 1 ; HTML5rw JavaScript Validator Library 2 ; Compiled with js_of_ocaml for browser use 3 4 + (library 5 + (name htmlrw_js) 6 + (public_name html5rw.js) 7 + (libraries 8 + html5rw 9 + htmlrw_check 10 + bytesrw 11 + brr) 12 + (modes byte) ; js_of_ocaml requires bytecode 13 + (modules 14 + htmlrw_js_types 15 + htmlrw_js_dom 16 + htmlrw_js_annotate 17 + htmlrw_js_ui 18 + htmlrw_js)) 19 20 ; Standalone JavaScript file for direct browser use 21 ; This compiles the library entry point to a .js file 22 + (executable 23 + (name htmlrw_js_main) 24 + (libraries htmlrw_js) 25 + (js_of_ocaml 26 + (javascript_files)) 27 + (modes js wasm) 28 + (modules htmlrw_js_main)) 29 30 ; Web Worker for background validation 31 ; Runs validation in a separate thread to avoid blocking the UI 32 + (executable 33 + (name htmlrw_js_worker) 34 + (libraries html5rw htmlrw_check bytesrw brr) 35 + (js_of_ocaml 36 + (javascript_files)) 37 + (modes js wasm) 38 + (modules htmlrw_js_worker)) 39 40 ; Test runner for browser-based regression testing 41 ; Runs html5lib conformance tests in the browser 42 + (executable 43 + (name htmlrw_js_tests_main) 44 + (libraries html5rw bytesrw brr) 45 + (js_of_ocaml 46 + (javascript_files)) 47 + (modes js wasm) 48 + (modules htmlrw_js_tests htmlrw_js_tests_main)) 49 50 ; Copy to nice filenames (JS) 51 + (rule 52 + (targets htmlrw.js) 53 + (deps htmlrw_js_main.bc.js) 54 + (action (copy %{deps} %{targets}))) 55 56 + (rule 57 + (targets htmlrw-worker.js) 58 + (deps htmlrw_js_worker.bc.js) 59 + (action (copy %{deps} %{targets}))) 60 61 + (rule 62 + (targets htmlrw-tests.js) 63 + (deps htmlrw_js_tests_main.bc.js) 64 + (action (copy %{deps} %{targets}))) 65 66 ; Copy to nice filenames (WASM) 67 ; Note: requires wasm_of_ocaml-compiler to be installed 68 + (rule 69 + (targets htmlrw.wasm.js) 70 + (deps htmlrw_js_main.bc.wasm.js) 71 + (action (copy %{deps} %{targets}))) 72 73 + (rule 74 + (targets htmlrw-worker.wasm.js) 75 + (deps htmlrw_js_worker.bc.wasm.js) 76 + (action (copy %{deps} %{targets}))) 77 78 + (rule 79 + (targets htmlrw-tests.wasm.js) 80 + (deps htmlrw_js_tests_main.bc.wasm.js) 81 + (action (copy %{deps} %{targets}))) 82 83 ; Install web assets to share/html5rw-js/ for npm packaging 84 + (install 85 + (package html5rw-js) 86 + (section share) 87 + (files 88 + ; JavaScript bundles 89 + htmlrw.js 90 + htmlrw-worker.js 91 + htmlrw-tests.js 92 + ; WASM loader scripts 93 + htmlrw.wasm.js 94 + htmlrw-worker.wasm.js 95 + htmlrw-tests.wasm.js 96 + ; WASM assets (with content-hashed filenames) 97 + (glob_files_rec (htmlrw_js_main.bc.wasm.assets/* with_prefix htmlrw_js_main.bc.wasm.assets)) 98 + (glob_files_rec (htmlrw_js_worker.bc.wasm.assets/* with_prefix htmlrw_js_worker.bc.wasm.assets)) 99 + (glob_files_rec (htmlrw_js_tests_main.bc.wasm.assets/* with_prefix htmlrw_js_tests_main.bc.wasm.assets))))