···11+(** Simple AFL-compatible fuzzer for html5rw
22+33+ This fuzzer reads input from a file (passed as command line argument)
44+ and runs several property tests on it. It uses AflPersistent for
55+ efficient AFL fuzzing.
66+*)
77+88+(* Helper to create a bytes reader from a string *)
99+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
1010+1111+(* Serialize a parse result to string *)
1212+let serialize result =
1313+ Html5rw.to_string ~pretty:false result
1414+1515+(* Main fuzzing function - returns true if test passes *)
1616+let fuzz_input input =
1717+ try
1818+ (* Test 1: Parse should not crash *)
1919+ let result = Html5rw.parse (reader_of_string input) in
2020+2121+ (* Test 2: Serialization should not crash *)
2222+ let serialized = serialize result in
2323+2424+ (* Test 3: Reparse should not crash *)
2525+ let result2 = Html5rw.parse (reader_of_string serialized) in
2626+ let serialized2 = serialize result2 in
2727+2828+ (* Test 4: Roundtrip should stabilize (s2 == s3) *)
2929+ let result3 = Html5rw.parse (reader_of_string serialized2) in
3030+ let serialized3 = serialize result3 in
3131+3232+ if serialized2 <> serialized3 then begin
3333+ Printf.eprintf "ROUNDTRIP BUG:\n";
3434+ Printf.eprintf "Input: %s\n" (String.escaped (String.sub input 0 (min 100 (String.length input))));
3535+ Printf.eprintf "s2: %s\n" (String.escaped (String.sub serialized2 0 (min 100 (String.length serialized2))));
3636+ Printf.eprintf "s3: %s\n" (String.escaped (String.sub serialized3 0 (min 100 (String.length serialized3))));
3737+ (* Signal a bug to AFL by aborting *)
3838+ assert false
3939+ end;
4040+4141+ (* Test 5: Text extraction should not crash *)
4242+ let _ = Html5rw.to_text result in
4343+4444+ (* Test 6: Clone should produce identical output *)
4545+ let root = Html5rw.root result in
4646+ let cloned = Html5rw.clone ~deep:true root in
4747+ let original_html = Html5rw.Dom.to_html ~pretty:false root in
4848+ let cloned_html = Html5rw.Dom.to_html ~pretty:false cloned in
4949+5050+ if original_html <> cloned_html then begin
5151+ Printf.eprintf "CLONE BUG:\n";
5252+ Printf.eprintf "Original: %s\n" (String.escaped (String.sub original_html 0 (min 100 (String.length original_html))));
5353+ Printf.eprintf "Cloned: %s\n" (String.escaped (String.sub cloned_html 0 (min 100 (String.length cloned_html))));
5454+ assert false
5555+ end;
5656+5757+ (* Test 7: Selector queries should not crash (test a few common patterns) *)
5858+ let selectors = ["*"; "div"; ".class"; "#id"; "div > p"; "[attr]"] in
5959+ List.iter (fun sel ->
6060+ try
6161+ let _ = Html5rw.query result sel in ()
6262+ with Html5rw.Selector.Selector_error _ -> ()
6363+ ) selectors;
6464+6565+ true
6666+ with
6767+ | Assert_failure _ ->
6868+ (* Re-raise assert failures so AFL sees the crash *)
6969+ raise (Assert_failure ("", 0, 0))
7070+ | _ ->
7171+ (* Other exceptions are expected for malformed input *)
7272+ true
7373+7474+(* Read file contents *)
7575+let read_file filename =
7676+ let ic = open_in_bin filename in
7777+ let n = in_channel_length ic in
7878+ let s = really_input_string ic n in
7979+ close_in ic;
8080+ s
8181+8282+(* Main entry point *)
8383+let () =
8484+ (* Use AflPersistent for efficient AFL fuzzing *)
8585+ AflPersistent.run (fun () ->
8686+ if Array.length Sys.argv < 2 then begin
8787+ Printf.eprintf "Usage: %s <input_file>\n" Sys.argv.(0);
8888+ exit 1
8989+ end;
9090+ let input = read_file Sys.argv.(1) in
9191+ ignore (fuzz_input input)
9292+ )
+260
fuzz/fuzz_encoding.ml
···11+(* Encoding fuzzer for HTML5rw
22+ Tests UTF-8 handling, BOM, surrogates, and encoding edge cases *)
33+44+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
55+66+(* Test helper: parse and serialize, check for crashes *)
77+let test_encoding input =
88+ try
99+ let r = Html5rw.parse (reader_of_string input) in
1010+ let _ = Html5rw.to_string ~pretty:false r in
1111+ true
1212+ with _ ->
1313+ false
1414+1515+(* Test helper: check roundtrip stability *)
1616+let test_roundtrip input =
1717+ try
1818+ let r1 = Html5rw.parse (reader_of_string input) in
1919+ let s1 = Html5rw.to_string ~pretty:false r1 in
2020+ let r2 = Html5rw.parse (reader_of_string s1) in
2121+ let s2 = Html5rw.to_string ~pretty:false r2 in
2222+ let r3 = Html5rw.parse (reader_of_string s2) in
2323+ let s3 = Html5rw.to_string ~pretty:false r3 in
2424+ s2 = s3
2525+ with _ -> false
2626+2727+(* UTF-8 BOM *)
2828+let bom_cases = [|
2929+ "\xEF\xBB\xBF"; (* Just BOM *)
3030+ "\xEF\xBB\xBF<!DOCTYPE html>"; (* BOM + DOCTYPE *)
3131+ "\xEF\xBB\xBF<div>test</div>"; (* BOM + content *)
3232+ "\xEF\xBB\xBF\xEF\xBB\xBF"; (* Double BOM *)
3333+ "<div>\xEF\xBB\xBF</div>"; (* BOM in content - should be preserved as text *)
3434+|]
3535+3636+(* Valid UTF-8 sequences *)
3737+let valid_utf8_cases = [|
3838+ (* 1-byte (ASCII) *)
3939+ "hello";
4040+ "<div>test</div>";
4141+4242+ (* 2-byte sequences *)
4343+ "\xC2\xA0"; (* NBSP *)
4444+ "\xC3\xA9"; (* รฉ *)
4545+ "caf\xC3\xA9"; (* cafรฉ *)
4646+ "\xC2\xAB\xC2\xBB"; (* ยซ ยป *)
4747+4848+ (* 3-byte sequences *)
4949+ "\xE2\x80\x93"; (* en-dash *)
5050+ "\xE2\x80\x94"; (* em-dash *)
5151+ "\xE2\x80\x99"; (* right single quote *)
5252+ "\xE2\x80\x9C\xE2\x80\x9D"; (* curly quotes *)
5353+ "\xE4\xB8\xAD\xE6\x96\x87"; (* ไธญๆ *)
5454+ "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"; (* ๆฅๆฌ่ช *)
5555+5656+ (* 4-byte sequences (emoji, etc.) *)
5757+ "\xF0\x9F\x98\x80"; (* ๐ *)
5858+ "\xF0\x9F\x8E\x89"; (* ๐ *)
5959+ "\xF0\x9D\x94\xB8"; (* ๐ธ mathematical double-struck *)
6060+6161+ (* Mixed *)
6262+ "<div>\xC3\xA9\xE2\x80\x93\xF0\x9F\x98\x80</div>";
6363+ "<span title=\"\xC3\xA9\">text</span>";
6464+|]
6565+6666+(* Invalid UTF-8 sequences (should be handled gracefully) *)
6767+let invalid_utf8_cases = [|
6868+ (* Lone continuation bytes *)
6969+ "\x80";
7070+ "\xBF";
7171+ "\x80\x80\x80";
7272+7373+ (* Overlong sequences *)
7474+ "\xC0\x80"; (* Overlong NUL *)
7575+ "\xE0\x80\x80"; (* Overlong NUL (3-byte) *)
7676+ "\xF0\x80\x80\x80"; (* Overlong NUL (4-byte) *)
7777+ "\xC0\xAF"; (* Overlong / *)
7878+ "\xC1\xBF"; (* Overlong DEL *)
7979+8080+ (* Truncated sequences *)
8181+ "\xC2"; (* Start of 2-byte, missing continuation *)
8282+ "\xE0\x80"; (* Start of 3-byte, missing continuation *)
8383+ "\xF0\x80\x80"; (* Start of 4-byte, missing continuation *)
8484+8585+ (* Invalid start bytes *)
8686+ "\xFE";
8787+ "\xFF";
8888+ "\xFE\xFF"; (* UTF-16 BE BOM as bytes *)
8989+ "\xFF\xFE"; (* UTF-16 LE BOM as bytes *)
9090+9191+ (* Surrogate pairs (invalid in UTF-8) *)
9292+ "\xED\xA0\x80"; (* High surrogate U+D800 *)
9393+ "\xED\xBF\xBF"; (* Low surrogate U+DFFF *)
9494+ "\xED\xA0\x80\xED\xB0\x80"; (* Surrogate pair (should be single 4-byte) *)
9595+9696+ (* Out of range *)
9797+ "\xF4\x90\x80\x80"; (* U+110000, beyond Unicode *)
9898+ "\xF7\xBF\xBF\xBF"; (* U+1FFFFF, way beyond *)
9999+100100+ (* Invalid sequence in tag *)
101101+ "<div\x80>";
102102+ "<div class=\"\x80\">";
103103+ "</\x80div>";
104104+105105+ (* Invalid in attribute value *)
106106+ "<div data-x=\"\xC0\xAF\">";
107107+ "<div title=\"\xED\xA0\x80\">";
108108+|]
109109+110110+(* Control characters *)
111111+let control_char_cases = [|
112112+ (* NUL *)
113113+ "\x00";
114114+ "<div>\x00</div>";
115115+ "<div attr=\"\x00\">";
116116+117117+ (* Other C0 controls *)
118118+ "\x01\x02\x03";
119119+ "<div>\x08</div>"; (* backspace *)
120120+ "<div>\x0B</div>"; (* vertical tab *)
121121+ "<div>\x0C</div>"; (* form feed *)
122122+123123+ (* C1 controls (as UTF-8) *)
124124+ "\xC2\x80"; (* U+0080 *)
125125+ "\xC2\x9F"; (* U+009F *)
126126+127127+ (* DEL *)
128128+ "\x7F";
129129+130130+ (* Mixed with valid content *)
131131+ "<div>hello\x00world</div>";
132132+ "<div>test\x01\x02\x03</div>";
133133+|]
134134+135135+(* Unicode special characters *)
136136+let special_unicode_cases = [|
137137+ (* Zero-width characters *)
138138+ "\xE2\x80\x8B"; (* ZWSP U+200B *)
139139+ "\xE2\x80\x8C"; (* ZWNJ U+200C *)
140140+ "\xE2\x80\x8D"; (* ZWJ U+200D *)
141141+ "\xEF\xBB\xBF"; (* BOM/ZWNBSP U+FEFF *)
142142+143143+ (* Replacement character *)
144144+ "\xEF\xBF\xBD"; (* U+FFFD *)
145145+146146+ (* Byte order marks and special noncharacters *)
147147+ "\xEF\xBF\xBE"; (* U+FFFE - noncharacter *)
148148+ "\xEF\xBF\xBF"; (* U+FFFF - noncharacter *)
149149+150150+ (* Private use area *)
151151+ "\xEE\x80\x80"; (* U+E000 *)
152152+ "\xEF\xA3\xBF"; (* U+F8FF *)
153153+154154+ (* RTL and BiDi *)
155155+ "\xE2\x80\x8F"; (* RLM U+200F *)
156156+ "\xE2\x80\xAE"; (* RLO U+202E *)
157157+ "\xE2\x80\xAC"; (* PDF U+202C *)
158158+159159+ (* Combining characters *)
160160+ "e\xCC\x81"; (* e + combining acute = รฉ *)
161161+ "a\xCC\x80\xCC\x81\xCC\x82"; (* Multiple combining marks *)
162162+163163+ (* In HTML context *)
164164+ "<div>\xE2\x80\x8B</div>";
165165+ "<span title=\"\xE2\x80\x8F\">";
166166+|]
167167+168168+(* Numeric character references *)
169169+let ncr_cases = [|
170170+ (* Valid decimal *)
171171+ "A"; (* A *)
172172+ "©"; (* ยฉ *)
173173+ "—"; (* em-dash *)
174174+ "😀"; (* ๐ *)
175175+176176+ (* Valid hex *)
177177+ "A";
178178+ "©";
179179+ "—";
180180+ "😀";
181181+182182+ (* Edge cases *)
183183+ "�"; (* NUL - should become replacement *)
184184+ "�";
185185+ ""; (* DEL *)
186186+ "€"; (* C1 control *)
187187+ "Ÿ"; (* Last C1 control *)
188188+189189+ (* Surrogates (should be replaced) *)
190190+ "�";
191191+ "�";
192192+ "�"; (* D800 decimal *)
193193+194194+ (* Noncharacters *)
195195+ "";
196196+ "";
197197+ "";
198198+199199+ (* Beyond Unicode *)
200200+ "�";
201201+ "�";
202202+203203+ (* Very large numbers *)
204204+ "�";
205205+ "�";
206206+207207+ (* Invalid formats *)
208208+ "&#;";
209209+ "&#x;";
210210+ "&#xGHI;";
211211+ "&#abc;";
212212+|]
213213+214214+let run_test_category name cases test_fn =
215215+ let passed = ref 0 in
216216+ let failed = ref 0 in
217217+ Array.iter (fun input ->
218218+ if test_fn input then
219219+ incr passed
220220+ else begin
221221+ Printf.printf " FAIL: %s\n"
222222+ (String.escaped (String.sub input 0 (min 40 (String.length input))));
223223+ incr failed
224224+ end
225225+ ) cases;
226226+ Printf.printf "%s: %d/%d\n" name !passed (Array.length cases);
227227+ !failed = 0
228228+229229+let () =
230230+ Printf.printf "=== Encoding Tests ===\n\n";
231231+232232+ let all_pass = ref true in
233233+234234+ (* Test basic handling (no exceptions) *)
235235+ Printf.printf "--- Crash resistance tests ---\n";
236236+ if not (run_test_category "BOM handling" bom_cases test_encoding) then
237237+ all_pass := false;
238238+ if not (run_test_category "Valid UTF-8" valid_utf8_cases test_encoding) then
239239+ all_pass := false;
240240+ if not (run_test_category "Invalid UTF-8" invalid_utf8_cases test_encoding) then
241241+ all_pass := false;
242242+ if not (run_test_category "Control characters" control_char_cases test_encoding) then
243243+ all_pass := false;
244244+ if not (run_test_category "Special Unicode" special_unicode_cases test_encoding) then
245245+ all_pass := false;
246246+ if not (run_test_category "Numeric character refs" ncr_cases test_encoding) then
247247+ all_pass := false;
248248+249249+ (* Test roundtrip stability for valid cases *)
250250+ Printf.printf "\n--- Roundtrip stability tests ---\n";
251251+ if not (run_test_category "Valid UTF-8 roundtrip" valid_utf8_cases test_roundtrip) then
252252+ all_pass := false;
253253+254254+ Printf.printf "\n=== Summary ===\n";
255255+ if !all_pass then
256256+ Printf.printf "All encoding tests passed!\n"
257257+ else begin
258258+ Printf.printf "Some encoding tests failed!\n";
259259+ exit 1
260260+ end
···11+(* Resource exhaustion tests for HTML5rw
22+ Tests for algorithmic complexity bugs, memory issues, and DoS vectors *)
33+44+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
55+66+(* Timing helper *)
77+let time_it f =
88+ let start = Unix.gettimeofday () in
99+ let result = f () in
1010+ let elapsed = Unix.gettimeofday () -. start in
1111+ (result, elapsed)
1212+1313+(* Test 1: Deeply nested elements *)
1414+(* Note: Deep nesting can exhibit O(nยฒ) complexity in tree construction.
1515+ The timing thresholds are set to catch severe regressions while allowing
1616+ for some expected slowdown with very deep nesting. *)
1717+let test_deep_nesting depth =
1818+ let input = String.concat "" (List.init depth (fun _ -> "<div>")) in
1919+ let (_, elapsed) = time_it (fun () ->
2020+ try
2121+ let r = Html5rw.parse (reader_of_string input) in
2222+ let _ = Html5rw.to_string r in
2323+ true
2424+ with _ -> false
2525+ ) in
2626+ (* Allow quadratic behavior up to a reasonable limit for very deep nesting.
2727+ HTML5 spec allows implementations to impose nesting limits for DoS protection. *)
2828+ let max_time =
2929+ if depth <= 1000 then float depth *. 0.001 +. 1.0
3030+ else float depth *. 0.02 +. 30.0 (* Very lenient for extreme depth - known O(nยฒ) case *)
3131+ in
3232+ if elapsed > max_time then begin
3333+ Printf.printf "SLOW: deep_nesting(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time;
3434+ false
3535+ end else
3636+ true
3737+3838+(* Test 2: Wide trees (many siblings) *)
3939+let test_wide_tree width =
4040+ let children = String.concat "" (List.init width (fun i -> Printf.sprintf "<span>%d</span>" i)) in
4141+ let input = "<div>" ^ children ^ "</div>" in
4242+ let (_, elapsed) = time_it (fun () ->
4343+ try
4444+ let r = Html5rw.parse (reader_of_string input) in
4545+ let _ = Html5rw.to_string r in
4646+ true
4747+ with _ -> false
4848+ ) in
4949+ let max_time = float width *. 0.0001 +. 0.5 in
5050+ if elapsed > max_time then begin
5151+ Printf.printf "SLOW: wide_tree(%d) took %.3fs (max %.3fs)\n" width elapsed max_time;
5252+ false
5353+ end else
5454+ true
5555+5656+(* Test 3: Huge text nodes *)
5757+let test_huge_text size =
5858+ let text = String.make size 'x' in
5959+ let input = "<div>" ^ text ^ "</div>" in
6060+ let (_, elapsed) = time_it (fun () ->
6161+ try
6262+ let r = Html5rw.parse (reader_of_string input) in
6363+ let _ = Html5rw.to_string r in
6464+ true
6565+ with _ -> false
6666+ ) in
6767+ let max_time = float size *. 0.00001 +. 0.5 in
6868+ if elapsed > max_time then begin
6969+ Printf.printf "SLOW: huge_text(%d) took %.3fs (max %.3fs)\n" size elapsed max_time;
7070+ false
7171+ end else
7272+ true
7373+7474+(* Test 4: Many attributes *)
7575+let test_many_attrs count =
7676+ let attrs = String.concat " " (List.init count (fun i -> Printf.sprintf "a%d=\"v%d\"" i i)) in
7777+ let input = Printf.sprintf "<div %s></div>" attrs in
7878+ let (_, elapsed) = time_it (fun () ->
7979+ try
8080+ let r = Html5rw.parse (reader_of_string input) in
8181+ let _ = Html5rw.to_string r in
8282+ true
8383+ with _ -> false
8484+ ) in
8585+ let max_time = float count *. 0.0001 +. 0.5 in
8686+ if elapsed > max_time then begin
8787+ Printf.printf "SLOW: many_attrs(%d) took %.3fs (max %.3fs)\n" count elapsed max_time;
8888+ false
8989+ end else
9090+ true
9191+9292+(* Test 5: Huge attribute values *)
9393+let test_huge_attr_value size =
9494+ let value = String.make size 'x' in
9595+ let input = Printf.sprintf "<div data-x=\"%s\"></div>" value in
9696+ let (_, elapsed) = time_it (fun () ->
9797+ try
9898+ let r = Html5rw.parse (reader_of_string input) in
9999+ let _ = Html5rw.to_string r in
100100+ true
101101+ with _ -> false
102102+ ) in
103103+ let max_time = float size *. 0.00001 +. 0.5 in
104104+ if elapsed > max_time then begin
105105+ Printf.printf "SLOW: huge_attr_value(%d) took %.3fs (max %.3fs)\n" size elapsed max_time;
106106+ false
107107+ end else
108108+ true
109109+110110+(* Test 6: Repeated unclosed p tags (adoption agency stress test) *)
111111+let test_repeated_p count =
112112+ let input = String.concat "" (List.init count (fun _ -> "<p>")) in
113113+ let (_, elapsed) = time_it (fun () ->
114114+ try
115115+ let r = Html5rw.parse (reader_of_string input) in
116116+ let _ = Html5rw.to_string r in
117117+ true
118118+ with _ -> false
119119+ ) in
120120+ (* This could trigger O(n^2) behavior in naive implementations *)
121121+ let max_time = float count *. 0.001 +. 1.0 in
122122+ if elapsed > max_time then begin
123123+ Printf.printf "SLOW: repeated_p(%d) took %.3fs (max %.3fs)\n" count elapsed max_time;
124124+ false
125125+ end else
126126+ true
127127+128128+(* Test 7: Nested formatting elements (adoption agency stress) *)
129129+let test_nested_formatting depth =
130130+ let tags = [| "a"; "b"; "i"; "em"; "strong" |] in
131131+ let open_tags = String.concat "" (List.init depth (fun i -> "<" ^ tags.(i mod 5) ^ ">")) in
132132+ let input = open_tags ^ "text" in
133133+ let (_, elapsed) = time_it (fun () ->
134134+ try
135135+ let r = Html5rw.parse (reader_of_string input) in
136136+ let _ = Html5rw.to_string r in
137137+ true
138138+ with _ -> false
139139+ ) in
140140+ let max_time = float depth *. 0.001 +. 0.5 in
141141+ if elapsed > max_time then begin
142142+ Printf.printf "SLOW: nested_formatting(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time;
143143+ false
144144+ end else
145145+ true
146146+147147+(* Test 8: Table with many cells *)
148148+let test_large_table rows cols =
149149+ let cells = String.concat "" (List.init cols (fun _ -> "<td>x</td>")) in
150150+ let row = "<tr>" ^ cells ^ "</tr>" in
151151+ let tbody = String.concat "" (List.init rows (fun _ -> row)) in
152152+ let input = "<table><tbody>" ^ tbody ^ "</tbody></table>" in
153153+ let (_, elapsed) = time_it (fun () ->
154154+ try
155155+ let r = Html5rw.parse (reader_of_string input) in
156156+ let _ = Html5rw.to_string r in
157157+ true
158158+ with _ -> false
159159+ ) in
160160+ let total = rows * cols in
161161+ let max_time = float total *. 0.0001 +. 1.0 in
162162+ if elapsed > max_time then begin
163163+ Printf.printf "SLOW: large_table(%dx%d) took %.3fs (max %.3fs)\n" rows cols elapsed max_time;
164164+ false
165165+ end else
166166+ true
167167+168168+(* Test 9: Deeply nested tables *)
169169+let test_nested_tables depth =
170170+ let rec make_table d =
171171+ if d = 0 then "x"
172172+ else "<table><tr><td>" ^ make_table (d - 1) ^ "</td></tr></table>"
173173+ in
174174+ let input = make_table depth in
175175+ let (_, elapsed) = time_it (fun () ->
176176+ try
177177+ let r = Html5rw.parse (reader_of_string input) in
178178+ let _ = Html5rw.to_string r in
179179+ true
180180+ with _ -> false
181181+ ) in
182182+ let max_time = float depth *. 0.01 +. 0.5 in
183183+ if elapsed > max_time then begin
184184+ Printf.printf "SLOW: nested_tables(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time;
185185+ false
186186+ end else
187187+ true
188188+189189+(* Test 10: Many entity references *)
190190+let test_many_entities count =
191191+ let entities = String.concat "" (List.init count (fun _ -> "&")) in
192192+ let input = "<div>" ^ entities ^ "</div>" in
193193+ let (_, elapsed) = time_it (fun () ->
194194+ try
195195+ let r = Html5rw.parse (reader_of_string input) in
196196+ let _ = Html5rw.to_string r in
197197+ true
198198+ with _ -> false
199199+ ) in
200200+ let max_time = float count *. 0.0001 +. 0.5 in
201201+ if elapsed > max_time then begin
202202+ Printf.printf "SLOW: many_entities(%d) took %.3fs (max %.3fs)\n" count elapsed max_time;
203203+ false
204204+ end else
205205+ true
206206+207207+(* Run all exhaustion tests *)
208208+let run_all_tests () =
209209+ let tests = [
210210+ ("deep_nesting_100", fun () -> test_deep_nesting 100);
211211+ ("deep_nesting_1000", fun () -> test_deep_nesting 1000);
212212+ ("deep_nesting_5000", fun () -> test_deep_nesting 5000);
213213+ ("wide_tree_100", fun () -> test_wide_tree 100);
214214+ ("wide_tree_1000", fun () -> test_wide_tree 1000);
215215+ ("wide_tree_10000", fun () -> test_wide_tree 10000);
216216+ ("huge_text_10000", fun () -> test_huge_text 10000);
217217+ ("huge_text_100000", fun () -> test_huge_text 100000);
218218+ ("many_attrs_100", fun () -> test_many_attrs 100);
219219+ ("many_attrs_1000", fun () -> test_many_attrs 1000);
220220+ ("huge_attr_10000", fun () -> test_huge_attr_value 10000);
221221+ ("huge_attr_100000", fun () -> test_huge_attr_value 100000);
222222+ ("repeated_p_100", fun () -> test_repeated_p 100);
223223+ ("repeated_p_500", fun () -> test_repeated_p 500);
224224+ ("nested_formatting_50", fun () -> test_nested_formatting 50);
225225+ ("nested_formatting_200", fun () -> test_nested_formatting 200);
226226+ ("large_table_10x10", fun () -> test_large_table 10 10);
227227+ ("large_table_100x100", fun () -> test_large_table 100 100);
228228+ ("nested_tables_10", fun () -> test_nested_tables 10);
229229+ ("nested_tables_50", fun () -> test_nested_tables 50);
230230+ ("many_entities_1000", fun () -> test_many_entities 1000);
231231+ ("many_entities_10000", fun () -> test_many_entities 10000);
232232+ ] in
233233+234234+ let passed = ref 0 in
235235+ let failed = ref 0 in
236236+237237+ List.iter (fun (name, test) ->
238238+ Printf.printf "Running %s... %!" name;
239239+ if test () then begin
240240+ Printf.printf "PASS\n%!";
241241+ incr passed
242242+ end else begin
243243+ Printf.printf "FAIL\n%!";
244244+ incr failed
245245+ end
246246+ ) tests;
247247+248248+ Printf.printf "\n=== Summary ===\n";
249249+ Printf.printf "Passed: %d\n" !passed;
250250+ Printf.printf "Failed: %d\n" !failed;
251251+252252+ !failed = 0
253253+254254+let () =
255255+ if not (run_all_tests ()) then
256256+ exit 1
+261
fuzz/fuzz_fragment.ml
···11+(* Fragment parsing fuzzer for HTML5rw
22+ Tests innerHTML-style fragment parsing with various context elements *)
33+44+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
55+66+(* All context element types to test *)
77+let html_contexts = [|
88+ "div"; "span"; "p"; "a"; "b"; "i"; "em"; "strong";
99+ "ul"; "ol"; "li"; "dl"; "dt"; "dd";
1010+ "table"; "thead"; "tbody"; "tfoot"; "tr"; "th"; "td"; "caption";
1111+ "select"; "optgroup"; "option";
1212+ "form"; "fieldset"; "legend"; "label"; "input"; "button"; "textarea";
1313+ "pre"; "code"; "blockquote";
1414+ "h1"; "h2"; "h3"; "h4"; "h5"; "h6";
1515+ "article"; "section"; "nav"; "aside"; "header"; "footer"; "main";
1616+ "figure"; "figcaption";
1717+ "template";
1818+|]
1919+2020+let svg_contexts = [|
2121+ "svg"; "g"; "circle"; "rect"; "path"; "text"; "tspan";
2222+ "foreignObject"; "title"; "desc";
2323+|]
2424+2525+let math_contexts = [|
2626+ "math"; "mi"; "mo"; "mn"; "ms"; "mrow";
2727+ "annotation-xml";
2828+|]
2929+3030+(* Test fragments for different contexts *)
3131+let general_fragments = [|
3232+ "text content";
3333+ "<span>inline</span>";
3434+ "<div>block</div>";
3535+ "<a href=\"#\">link</a>";
3636+ "<b><i>nested</i></b>";
3737+ "text<br>text";
3838+ "<!-- comment -->";
3939+ "<img src=\"x\">";
4040+|]
4141+4242+let list_fragments = [|
4343+ "<li>item</li>";
4444+ "<li>one</li><li>two</li>";
4545+ "text in list";
4646+ "<li><ul><li>nested</li></ul></li>";
4747+|]
4848+4949+let table_fragments = [|
5050+ "<tr><td>cell</td></tr>";
5151+ "<td>cell</td>";
5252+ "<th>header</th>";
5353+ "text in table";
5454+ "<tbody><tr><td>x</td></tr></tbody>";
5555+ "<caption>title</caption>";
5656+|]
5757+5858+let select_fragments = [|
5959+ "<option>opt</option>";
6060+ "<option value=\"1\">one</option><option value=\"2\">two</option>";
6161+ "<optgroup label=\"group\"><option>x</option></optgroup>";
6262+ "text in select";
6363+|]
6464+6565+let svg_fragments = [|
6666+ "<circle cx=\"50\" cy=\"50\" r=\"40\"/>";
6767+ "<rect x=\"0\" y=\"0\" width=\"100\" height=\"100\"/>";
6868+ "<text>SVG text</text>";
6969+ "<g><circle r=\"10\"/></g>";
7070+ "<foreignObject><div>HTML in SVG</div></foreignObject>";
7171+|]
7272+7373+let math_fragments = [|
7474+ "<mi>x</mi>";
7575+ "<mo>=</mo>";
7676+ "<mn>42</mn>";
7777+ "<mrow><mi>x</mi><mo>=</mo><mn>1</mn></mrow>";
7878+|]
7979+8080+(* Test parsing a fragment with a given context *)
8181+let test_fragment_parse ctx_tag ?namespace fragment =
8282+ try
8383+ let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag ?namespace () in
8484+ let doc = Html5rw.parse ~fragment_context:ctx (reader_of_string fragment) in
8585+ let _ = Html5rw.to_string ~pretty:false doc in
8686+ true
8787+ with e ->
8888+ Printf.printf "Exception: %s\n" (Printexc.to_string e);
8989+ Printf.printf " Context: <%s>\n" ctx_tag;
9090+ Printf.printf " Fragment: %s\n" (String.escaped fragment);
9191+ false
9292+9393+(* Test roundtrip stability for fragment parsing *)
9494+let test_fragment_roundtrip ctx_tag ?namespace fragment =
9595+ try
9696+ let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag ?namespace () in
9797+ let doc1 = Html5rw.parse ~fragment_context:ctx (reader_of_string fragment) in
9898+ let s1 = Html5rw.to_string ~pretty:false doc1 in
9999+ let doc2 = Html5rw.parse ~fragment_context:ctx (reader_of_string s1) in
100100+ let s2 = Html5rw.to_string ~pretty:false doc2 in
101101+ let doc3 = Html5rw.parse ~fragment_context:ctx (reader_of_string s2) in
102102+ let s3 = Html5rw.to_string ~pretty:false doc3 in
103103+ if s2 <> s3 then begin
104104+ Printf.printf "Roundtrip mismatch:\n";
105105+ Printf.printf " Context: <%s>\n" ctx_tag;
106106+ Printf.printf " S2: %s\n" (String.escaped s2);
107107+ Printf.printf " S3: %s\n" (String.escaped s3);
108108+ false
109109+ end else
110110+ true
111111+ with _ -> false
112112+113113+(* Compare fragment parsing with different contexts *)
114114+let test_context_sensitivity () =
115115+ let test_cases = [|
116116+ (* These should produce different results in different contexts *)
117117+ ("<li>item</li>", [| "ul"; "ol"; "div"; "body" |]);
118118+ ("<tr><td>x</td></tr>", [| "table"; "tbody"; "div"; "body" |]);
119119+ ("<td>x</td>", [| "tr"; "table"; "div"; "body" |]);
120120+ ("<option>x</option>", [| "select"; "optgroup"; "div"; "body" |]);
121121+ ("<p>text</p>", [| "div"; "p"; "body" |]);
122122+ |] in
123123+124124+ let all_ok = ref true in
125125+ Array.iter (fun (fragment, contexts) ->
126126+ Array.iter (fun ctx ->
127127+ if not (test_fragment_parse ctx fragment) then begin
128128+ Printf.printf "FAIL: <%s> with fragment: %s\n" ctx fragment;
129129+ all_ok := false
130130+ end
131131+ ) contexts
132132+ ) test_cases;
133133+ !all_ok
134134+135135+(* Run comprehensive tests *)
136136+let run_all_tests () =
137137+ let all_pass = ref true in
138138+139139+ Printf.printf "=== Fragment Parsing Tests ===\n\n";
140140+141141+ (* Test HTML contexts with general fragments *)
142142+ Printf.printf "--- HTML contexts with general fragments ---\n";
143143+ let html_pass = ref 0 in
144144+ let html_fail = ref 0 in
145145+ Array.iter (fun ctx ->
146146+ Array.iter (fun fragment ->
147147+ if test_fragment_parse ctx fragment then
148148+ incr html_pass
149149+ else
150150+ incr html_fail
151151+ ) general_fragments
152152+ ) html_contexts;
153153+ Printf.printf "HTML contexts: %d/%d\n" !html_pass (!html_pass + !html_fail);
154154+ if !html_fail > 0 then all_pass := false;
155155+156156+ (* Test list contexts *)
157157+ Printf.printf "\n--- List contexts ---\n";
158158+ let list_pass = ref 0 in
159159+ let list_fail = ref 0 in
160160+ Array.iter (fun ctx ->
161161+ Array.iter (fun fragment ->
162162+ if test_fragment_parse ctx fragment then
163163+ incr list_pass
164164+ else
165165+ incr list_fail
166166+ ) list_fragments
167167+ ) [| "ul"; "ol"; "menu" |];
168168+ Printf.printf "List contexts: %d/%d\n" !list_pass (!list_pass + !list_fail);
169169+ if !list_fail > 0 then all_pass := false;
170170+171171+ (* Test table contexts *)
172172+ Printf.printf "\n--- Table contexts ---\n";
173173+ let table_pass = ref 0 in
174174+ let table_fail = ref 0 in
175175+ Array.iter (fun ctx ->
176176+ Array.iter (fun fragment ->
177177+ if test_fragment_parse ctx fragment then
178178+ incr table_pass
179179+ else
180180+ incr table_fail
181181+ ) table_fragments
182182+ ) [| "table"; "tbody"; "thead"; "tfoot"; "tr" |];
183183+ Printf.printf "Table contexts: %d/%d\n" !table_pass (!table_pass + !table_fail);
184184+ if !table_fail > 0 then all_pass := false;
185185+186186+ (* Test select contexts *)
187187+ Printf.printf "\n--- Select contexts ---\n";
188188+ let select_pass = ref 0 in
189189+ let select_fail = ref 0 in
190190+ Array.iter (fun ctx ->
191191+ Array.iter (fun fragment ->
192192+ if test_fragment_parse ctx fragment then
193193+ incr select_pass
194194+ else
195195+ incr select_fail
196196+ ) select_fragments
197197+ ) [| "select"; "optgroup" |];
198198+ Printf.printf "Select contexts: %d/%d\n" !select_pass (!select_pass + !select_fail);
199199+ if !select_fail > 0 then all_pass := false;
200200+201201+ (* Test SVG contexts *)
202202+ Printf.printf "\n--- SVG contexts ---\n";
203203+ let svg_pass = ref 0 in
204204+ let svg_fail = ref 0 in
205205+ Array.iter (fun ctx ->
206206+ Array.iter (fun fragment ->
207207+ if test_fragment_parse ctx ~namespace:(Some "svg") fragment then
208208+ incr svg_pass
209209+ else
210210+ incr svg_fail
211211+ ) svg_fragments
212212+ ) svg_contexts;
213213+ Printf.printf "SVG contexts: %d/%d\n" !svg_pass (!svg_pass + !svg_fail);
214214+ if !svg_fail > 0 then all_pass := false;
215215+216216+ (* Test Math contexts *)
217217+ Printf.printf "\n--- Math contexts ---\n";
218218+ let math_pass = ref 0 in
219219+ let math_fail = ref 0 in
220220+ Array.iter (fun ctx ->
221221+ Array.iter (fun fragment ->
222222+ if test_fragment_parse ctx ~namespace:(Some "math") fragment then
223223+ incr math_pass
224224+ else
225225+ incr math_fail
226226+ ) math_fragments
227227+ ) math_contexts;
228228+ Printf.printf "Math contexts: %d/%d\n" !math_pass (!math_pass + !math_fail);
229229+ if !math_fail > 0 then all_pass := false;
230230+231231+ (* Test context sensitivity *)
232232+ Printf.printf "\n--- Context sensitivity ---\n";
233233+ if not (test_context_sensitivity ()) then
234234+ all_pass := false
235235+ else
236236+ Printf.printf "Context sensitivity: OK\n";
237237+238238+ (* Test roundtrip for a sample *)
239239+ Printf.printf "\n--- Roundtrip stability ---\n";
240240+ let rt_pass = ref 0 in
241241+ let rt_fail = ref 0 in
242242+ Array.iter (fun ctx ->
243243+ Array.iter (fun fragment ->
244244+ if test_fragment_roundtrip ctx fragment then
245245+ incr rt_pass
246246+ else
247247+ incr rt_fail
248248+ ) general_fragments
249249+ ) [| "div"; "span"; "ul"; "table"; "select" |];
250250+ Printf.printf "Roundtrip: %d/%d\n" !rt_pass (!rt_pass + !rt_fail);
251251+ if !rt_fail > 0 then all_pass := false;
252252+253253+ Printf.printf "\n=== Summary ===\n";
254254+ if !all_pass then
255255+ Printf.printf "All fragment parsing tests passed!\n"
256256+ else begin
257257+ Printf.printf "Some fragment parsing tests failed!\n";
258258+ exit 1
259259+ end
260260+261261+let () = run_all_tests ()
+537
fuzz/fuzz_html5rw.ml
···11+(** Comprehensive fuzz tests for html5rw HTML5 parser using Crowbar *)
22+33+open Crowbar
44+55+(* Helper to create a bytes reader from a string *)
66+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
77+88+(* ==========================================================================
99+ Generators for HTML-like content
1010+ ========================================================================== *)
1111+1212+(* Common tag names for structured generation *)
1313+let tag_names = [
1414+ "div"; "p"; "span"; "a"; "h1"; "h2"; "h3"; "ul"; "li"; "ol";
1515+ "table"; "tr"; "td"; "th"; "thead"; "tbody"; "form"; "input";
1616+ "button"; "select"; "option"; "textarea"; "label"; "img"; "br";
1717+ "hr"; "b"; "i"; "strong"; "em"; "code"; "pre"; "script"; "style";
1818+ "head"; "body"; "html"; "title"; "meta"; "link"; "nav"; "header";
1919+ "footer"; "main"; "section"; "article"; "aside"; "figure"; "svg";
2020+ "math"; "template"; "iframe"; "noscript"; "plaintext"; "xmp";
2121+]
2222+2323+let tag_name_gen = choose (List.map const tag_names)
2424+2525+(* Common attribute names *)
2626+let attr_names = [
2727+ "id"; "class"; "href"; "src"; "style"; "title"; "alt"; "name";
2828+ "type"; "value"; "data-foo"; "aria-label"; "onclick"; "onload";
2929+]
3030+3131+let attr_name_gen = choose (List.map const attr_names)
3232+3333+(* Generator for a simple attribute *)
3434+let attr_gen =
3535+ map [attr_name_gen; bytes] (fun name value ->
3636+ Printf.sprintf "%s=\"%s\"" name (String.escaped value))
3737+3838+(* Generator for attributes list *)
3939+let attrs_gen = list attr_gen
4040+4141+(* Generator for a simple opening tag *)
4242+let start_tag_gen =
4343+ map [tag_name_gen; attrs_gen] (fun tag attrs ->
4444+ let attrs_str = String.concat " " attrs in
4545+ if attrs_str = "" then Printf.sprintf "<%s>" tag
4646+ else Printf.sprintf "<%s %s>" tag attrs_str)
4747+4848+(* Generator for a simple closing tag *)
4949+let end_tag_gen =
5050+ map [tag_name_gen] (fun tag -> Printf.sprintf "</%s>" tag)
5151+5252+(* Generator for text content *)
5353+let text_content_gen =
5454+ choose [
5555+ const "";
5656+ const "Hello";
5757+ const "Hello, world!";
5858+ const "Test <with> special &chars;";
5959+ bytes;
6060+ ]
6161+6262+(* Generator for comments - used in html_gen via malformed_html_gen *)
6363+let _comment_gen =
6464+ map [bytes] (fun content ->
6565+ Printf.sprintf "<!--%s-->" content)
6666+6767+(* Generator for DOCTYPE *)
6868+let doctype_gen =
6969+ choose [
7070+ const "<!DOCTYPE html>";
7171+ const "<!doctype html>";
7272+ const "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">";
7373+ const "<!DOCTYPE>";
7474+ map [bytes] (fun s -> Printf.sprintf "<!DOCTYPE %s>" s);
7575+ ]
7676+7777+(* Generator for simple HTML fragments *)
7878+let simple_html_gen =
7979+ map [start_tag_gen; text_content_gen; end_tag_gen]
8080+ (fun start text _end_ -> start ^ text ^ _end_)
8181+8282+(* Generator for nested HTML *)
8383+let nested_html_gen =
8484+ map [start_tag_gen; simple_html_gen; end_tag_gen]
8585+ (fun outer inner _end_ -> outer ^ inner ^ _end_)
8686+8787+(* Generator for structured HTML with common patterns *)
8888+let structured_html_gen =
8989+ choose [
9090+ const "<html><head><title>Test</title></head><body></body></html>";
9191+ const "<!DOCTYPE html><html><body><p>Hello</p></body></html>";
9292+ const "<div><span>text</span></div>";
9393+ const "<table><tr><td>cell</td></tr></table>";
9494+ const "<ul><li>item1</li><li>item2</li></ul>";
9595+ const "<form><input type=\"text\"><button>Submit</button></form>";
9696+ const "<p>First</p><p>Second</p>";
9797+ const "<div><div><div>nested</div></div></div>";
9898+ simple_html_gen;
9999+ nested_html_gen;
100100+ ]
101101+102102+(* Generator for malformed/edge case HTML *)
103103+let malformed_html_gen =
104104+ choose [
105105+ const "<";
106106+ const ">";
107107+ const "</";
108108+ const "<>";
109109+ const "<<>>";
110110+ const "<div";
111111+ const "<div>";
112112+ const "</div>";
113113+ const "<div><span>";
114114+ const "<div></span></div>";
115115+ const "<p><div></div></p>";
116116+ const "<!-";
117117+ const "<!--";
118118+ const "<!-->";
119119+ const "<!--->";
120120+ const "<!-- -- -->";
121121+ const "&";
122122+ const "&";
123123+ const "&";
124124+ const "&#";
125125+ const "<";
126126+ const "<";
127127+ const "&#x";
128128+ const "<";
129129+ const "<script>alert('xss')</script>";
130130+ const "<style>body{}</style>";
131131+ const "<![CDATA[test]]>";
132132+ const "<?xml version=\"1.0\"?>";
133133+ const "<svg><foreignObject></foreignObject></svg>";
134134+ const "<math><mi>x</mi></math>";
135135+ const "<template><div>content</div></template>";
136136+ const "<table><div>misplaced</div><tr><td>ok</td></tr></table>";
137137+ map [bytes] (fun s -> "<" ^ s ^ ">");
138138+ map [bytes; bytes] (fun a b -> "<" ^ a ^ " " ^ b ^ ">");
139139+ ]
140140+141141+(* Combined HTML generator *)
142142+let html_gen =
143143+ choose [
144144+ bytes; (* Completely random *)
145145+ structured_html_gen; (* Well-structured HTML *)
146146+ malformed_html_gen; (* Known edge cases *)
147147+ map [doctype_gen; structured_html_gen] (fun dt html -> dt ^ html);
148148+ ]
149149+150150+(* CSS selector generators *)
151151+let selector_gen =
152152+ choose [
153153+ const "*";
154154+ const "div";
155155+ const "#id";
156156+ const ".class";
157157+ const "div.class";
158158+ const "div#id";
159159+ const "[attr]";
160160+ const "[attr=value]";
161161+ const "[attr~=value]";
162162+ const "[attr|=value]";
163163+ const "[attr^=value]";
164164+ const "[attr$=value]";
165165+ const "[attr*=value]";
166166+ const ":first-child";
167167+ const ":last-child";
168168+ const ":nth-child(1)";
169169+ const ":nth-child(2n+1)";
170170+ const ":only-child";
171171+ const ":empty";
172172+ const ":not(div)";
173173+ const "div > p";
174174+ const "div p";
175175+ const "div + p";
176176+ const "div ~ p";
177177+ const "div, p";
178178+ const "div > p.class#id[attr]:first-child";
179179+ bytes; (* Random selector to find crashes *)
180180+ ]
181181+182182+(* Fragment context tag names
183183+ Note: raw text elements (script, style, textarea, title, xmp, iframe, etc.)
184184+ are excluded because fragment content parsed in their context cannot
185185+ round-trip correctly - the content is raw text but serialized without
186186+ the element wrapper, so escaping behavior differs. *)
187187+let fragment_context_gen =
188188+ choose [
189189+ const "div";
190190+ const "body";
191191+ const "html";
192192+ const "table";
193193+ const "tr";
194194+ const "tbody";
195195+ const "thead";
196196+ const "td";
197197+ const "th";
198198+ const "ul";
199199+ const "ol";
200200+ const "select";
201201+ const "template";
202202+ const "svg";
203203+ const "math";
204204+ (* Exclude raw text contexts: script, style, textarea, title *)
205205+ ]
206206+207207+(* ==========================================================================
208208+ Test 1: Crash resistance - arbitrary input should not crash
209209+ ========================================================================== *)
210210+211211+let () =
212212+ add_test ~name:"html5rw_no_crash_bytes" [bytes] @@ fun input ->
213213+ let _ =
214214+ try Html5rw.parse (reader_of_string input)
215215+ with _ -> Html5rw.parse (reader_of_string "")
216216+ in
217217+ check true
218218+219219+let () =
220220+ add_test ~name:"html5rw_no_crash_html" [html_gen] @@ fun input ->
221221+ let _ =
222222+ try Html5rw.parse (reader_of_string input)
223223+ with _ -> Html5rw.parse (reader_of_string "")
224224+ in
225225+ check true
226226+227227+let () =
228228+ add_test ~name:"html5rw_parse_bytes_no_crash" [bytes] @@ fun input ->
229229+ let _ =
230230+ try Html5rw.parse_bytes (Bytes.of_string input)
231231+ with _ -> Html5rw.parse_bytes (Bytes.of_string "")
232232+ in
233233+ check true
234234+235235+(* ==========================================================================
236236+ Test 2: Roundtrip - parse -> serialize -> reparse should be consistent
237237+ ========================================================================== *)
238238+239239+(* Serialize a parse result to string *)
240240+let serialize result =
241241+ Html5rw.to_string ~pretty:false result
242242+243243+(* Compare two DOM trees structurally (text content of serialized output) *)
244244+let _trees_equivalent result1 result2 =
245245+ let s1 = serialize result1 in
246246+ let s2 = serialize result2 in
247247+ s1 = s2
248248+249249+let () =
250250+ add_test ~name:"html5rw_roundtrip_idempotent" [html_gen] @@ fun input ->
251251+ try
252252+ (* Parse original *)
253253+ let result1 = Html5rw.parse (reader_of_string input) in
254254+ let serialized1 = serialize result1 in
255255+256256+ (* Reparse serialized output *)
257257+ let result2 = Html5rw.parse (reader_of_string serialized1) in
258258+ let serialized2 = serialize result2 in
259259+260260+ (* The second serialization should equal the first *)
261261+ (* (First parse may normalize, but second should be stable) *)
262262+ if serialized1 <> serialized2 then begin
263263+ Printf.printf "\nRoundtrip mismatch:\n";
264264+ Printf.printf "Input: %s\n" (String.escaped (String.sub input 0 (min 200 (String.length input))));
265265+ Printf.printf "First: %s\n" (String.escaped (String.sub serialized1 0 (min 200 (String.length serialized1))));
266266+ Printf.printf "Second: %s\n" (String.escaped (String.sub serialized2 0 (min 200 (String.length serialized2))));
267267+ check false
268268+ end else
269269+ check true
270270+ with e ->
271271+ Printf.printf "\nRoundtrip exception: %s\n" (Printexc.to_string e);
272272+ check false
273273+274274+(* Additional roundtrip test: parse -> serialize -> reparse -> serialize should stabilize *)
275275+let () =
276276+ add_test ~name:"html5rw_triple_roundtrip" [structured_html_gen] @@ fun input ->
277277+ try
278278+ let r1 = Html5rw.parse (reader_of_string input) in
279279+ let s1 = serialize r1 in
280280+281281+ let r2 = Html5rw.parse (reader_of_string s1) in
282282+ let s2 = serialize r2 in
283283+284284+ let r3 = Html5rw.parse (reader_of_string s2) in
285285+ let s3 = serialize r3 in
286286+287287+ (* By the third roundtrip, output should be stable *)
288288+ if s2 <> s3 then begin
289289+ Printf.printf "\nTriple roundtrip not stable:\n";
290290+ Printf.printf "s2: %s\n" (String.escaped (String.sub s2 0 (min 200 (String.length s2))));
291291+ Printf.printf "s3: %s\n" (String.escaped (String.sub s3 0 (min 200 (String.length s3))));
292292+ check false
293293+ end else
294294+ check true
295295+ with e ->
296296+ Printf.printf "\nTriple roundtrip exception: %s\n" (Printexc.to_string e);
297297+ check false
298298+299299+(* ==========================================================================
300300+ Test 3: Serialization idempotence
301301+ ========================================================================== *)
302302+303303+let () =
304304+ add_test ~name:"html5rw_serialize_idempotent" [html_gen] @@ fun input ->
305305+ try
306306+ let result = Html5rw.parse (reader_of_string input) in
307307+ let s1 = serialize result in
308308+ let s2 = serialize result in
309309+ if s1 <> s2 then begin
310310+ Printf.printf "\nSerialization not idempotent!\n";
311311+ check false
312312+ end else
313313+ check true
314314+ with e ->
315315+ Printf.printf "\nSerialization exception: %s\n" (Printexc.to_string e);
316316+ check false
317317+318318+(* ==========================================================================
319319+ Test 4: CSS Selector crash resistance
320320+ ========================================================================== *)
321321+322322+let () =
323323+ add_test ~name:"html5rw_selector_no_crash" [selector_gen; html_gen] @@ fun selector html ->
324324+ try
325325+ let result = Html5rw.parse (reader_of_string html) in
326326+ let _ = Html5rw.query result selector in
327327+ check true
328328+ with
329329+ | Html5rw.Selector.Selector_error _ -> check true (* Expected for malformed selectors *)
330330+ | e ->
331331+ Printf.printf "\nUnexpected selector exception: %s\n" (Printexc.to_string e);
332332+ Printf.printf "Selector: %s\n" (String.escaped selector);
333333+ check false
334334+335335+let () =
336336+ add_test ~name:"html5rw_matches_no_crash" [selector_gen; html_gen] @@ fun selector html ->
337337+ try
338338+ let result = Html5rw.parse (reader_of_string html) in
339339+ let root = Html5rw.root result in
340340+ let _ = Html5rw.matches root selector in
341341+ check true
342342+ with
343343+ | Html5rw.Selector.Selector_error _ -> check true
344344+ | e ->
345345+ Printf.printf "\nUnexpected matches exception: %s\n" (Printexc.to_string e);
346346+ check false
347347+348348+(* ==========================================================================
349349+ Test 5: Fragment parsing
350350+ ========================================================================== *)
351351+352352+let () =
353353+ add_test ~name:"html5rw_fragment_no_crash" [fragment_context_gen; html_gen]
354354+ @@ fun ctx_tag html ->
355355+ try
356356+ let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag () in
357357+ let _ = Html5rw.parse ~fragment_context:ctx (reader_of_string html) in
358358+ check true
359359+ with e ->
360360+ Printf.printf "\nFragment parse exception with context '%s': %s\n"
361361+ ctx_tag (Printexc.to_string e);
362362+ check false
363363+364364+(* Fragment roundtrip *)
365365+let () =
366366+ add_test ~name:"html5rw_fragment_roundtrip" [fragment_context_gen; structured_html_gen]
367367+ @@ fun ctx_tag html ->
368368+ try
369369+ let ctx = Html5rw.make_fragment_context ~tag_name:ctx_tag () in
370370+ let r1 = Html5rw.parse ~fragment_context:ctx (reader_of_string html) in
371371+ let s1 = serialize r1 in
372372+373373+ let r2 = Html5rw.parse ~fragment_context:ctx (reader_of_string s1) in
374374+ let s2 = serialize r2 in
375375+376376+ let r3 = Html5rw.parse ~fragment_context:ctx (reader_of_string s2) in
377377+ let s3 = serialize r3 in
378378+379379+ if s2 <> s3 then begin
380380+ Printf.printf "\nFragment roundtrip not stable with context '%s'\n" ctx_tag;
381381+ Printf.printf "Input: %s\n" (String.escaped (String.sub html 0 (min 100 (String.length html))));
382382+ Printf.printf "s1: %s\n" (String.escaped (String.sub s1 0 (min 100 (String.length s1))));
383383+ Printf.printf "s2: %s\n" (String.escaped (String.sub s2 0 (min 100 (String.length s2))));
384384+ Printf.printf "s3: %s\n" (String.escaped (String.sub s3 0 (min 100 (String.length s3))));
385385+ check false
386386+ end else
387387+ check true
388388+ with e ->
389389+ Printf.printf "\nFragment roundtrip exception: %s\n" (Printexc.to_string e);
390390+ check false
391391+392392+(* ==========================================================================
393393+ Test 6: DOM manipulation consistency
394394+ ========================================================================== *)
395395+396396+let () =
397397+ add_test ~name:"html5rw_dom_manipulation" [tag_name_gen; bytes] @@ fun tag text ->
398398+ try
399399+ (* Create element, add text, serialize, reparse *)
400400+ let elem = Html5rw.create_element tag () in
401401+ let text_node = Html5rw.create_text text in
402402+ Html5rw.append_child elem text_node;
403403+404404+ (* Create a document to hold it *)
405405+ let doc = Html5rw.create_document () in
406406+ let html = Html5rw.create_element "html" () in
407407+ let body = Html5rw.create_element "body" () in
408408+ Html5rw.append_child doc html;
409409+ Html5rw.append_child html body;
410410+ Html5rw.append_child body elem;
411411+412412+ (* Serialize via Dom.to_html *)
413413+ let serialized = Html5rw.Dom.to_html ~pretty:false doc in
414414+415415+ (* Reparse *)
416416+ let result = Html5rw.parse (reader_of_string serialized) in
417417+ let _ = Html5rw.to_string result in
418418+ check true
419419+ with e ->
420420+ Printf.printf "\nDOM manipulation exception: %s\n" (Printexc.to_string e);
421421+ check false
422422+423423+(* ==========================================================================
424424+ Test 7: Text extraction consistency
425425+ ========================================================================== *)
426426+427427+let () =
428428+ add_test ~name:"html5rw_text_extraction" [html_gen] @@ fun html ->
429429+ try
430430+ let result = Html5rw.parse (reader_of_string html) in
431431+ let _ = Html5rw.to_text result in
432432+ check true
433433+ with e ->
434434+ Printf.printf "\nText extraction exception: %s\n" (Printexc.to_string e);
435435+ check false
436436+437437+(* ==========================================================================
438438+ Test 8: Clone consistency
439439+ ========================================================================== *)
440440+441441+let () =
442442+ add_test ~name:"html5rw_clone_deep" [html_gen] @@ fun html ->
443443+ try
444444+ let result = Html5rw.parse (reader_of_string html) in
445445+ let root = Html5rw.root result in
446446+ let cloned = Html5rw.clone ~deep:true root in
447447+448448+ (* Serialize both and compare *)
449449+ let original_html = Html5rw.Dom.to_html ~pretty:false root in
450450+ let cloned_html = Html5rw.Dom.to_html ~pretty:false cloned in
451451+452452+ if original_html <> cloned_html then begin
453453+ Printf.printf "\nClone mismatch:\n";
454454+ Printf.printf "Original: %s\n" (String.escaped (String.sub original_html 0 (min 200 (String.length original_html))));
455455+ Printf.printf "Cloned: %s\n" (String.escaped (String.sub cloned_html 0 (min 200 (String.length cloned_html))));
456456+ check false
457457+ end else
458458+ check true
459459+ with e ->
460460+ Printf.printf "\nClone exception: %s\n" (Printexc.to_string e);
461461+ check false
462462+463463+(* ==========================================================================
464464+ Test 9: Error collection should not affect parsing result
465465+ ========================================================================== *)
466466+467467+let () =
468468+ add_test ~name:"html5rw_error_collection_consistent" [html_gen] @@ fun html ->
469469+ try
470470+ let r1 = Html5rw.parse ~collect_errors:false (reader_of_string html) in
471471+ let r2 = Html5rw.parse ~collect_errors:true (reader_of_string html) in
472472+473473+ let s1 = serialize r1 in
474474+ let s2 = serialize r2 in
475475+476476+ if s1 <> s2 then begin
477477+ Printf.printf "\nError collection changes output!\n";
478478+ Printf.printf "Without: %s\n" (String.escaped (String.sub s1 0 (min 200 (String.length s1))));
479479+ Printf.printf "With: %s\n" (String.escaped (String.sub s2 0 (min 200 (String.length s2))));
480480+ check false
481481+ end else
482482+ check true
483483+ with e ->
484484+ Printf.printf "\nError collection exception: %s\n" (Printexc.to_string e);
485485+ check false
486486+487487+(* ==========================================================================
488488+ Test 10: Pretty printing should produce parseable HTML
489489+ ========================================================================== *)
490490+491491+(* Helper to normalize whitespace for comparison
492492+ Pretty printing adds whitespace that becomes text nodes, so we compare
493493+ text content only to verify semantic equivalence.
494494+ We collapse all whitespace sequences to single spaces. *)
495495+let normalize_for_comparison result =
496496+ let text = Html5rw.to_text ~separator:" " ~strip:true result in
497497+ (* Collapse whitespace sequences *)
498498+ let buf = Buffer.create (String.length text) in
499499+ let in_space = ref false in
500500+ String.iter (fun c ->
501501+ match c with
502502+ | ' ' | '\t' | '\n' | '\r' ->
503503+ if not !in_space then begin
504504+ Buffer.add_char buf ' ';
505505+ in_space := true
506506+ end
507507+ | c ->
508508+ Buffer.add_char buf c;
509509+ in_space := false
510510+ ) text;
511511+ String.trim (Buffer.contents buf)
512512+513513+let () =
514514+ add_test ~name:"html5rw_pretty_print_parseable" [html_gen] @@ fun html ->
515515+ try
516516+ let r1 = Html5rw.parse (reader_of_string html) in
517517+ let pretty = Html5rw.to_string ~pretty:true r1 in
518518+ let compact = Html5rw.to_string ~pretty:false r1 in
519519+520520+ (* Both should reparse to have same text content *)
521521+ let r_pretty = Html5rw.parse (reader_of_string pretty) in
522522+ let r_compact = Html5rw.parse (reader_of_string compact) in
523523+524524+ let text_pretty = normalize_for_comparison r_pretty in
525525+ let text_compact = normalize_for_comparison r_compact in
526526+527527+ if text_pretty <> text_compact then begin
528528+ Printf.printf "\nPretty/compact text content mismatch!\n";
529529+ Printf.printf "Input: %s\n" (String.escaped (String.sub html 0 (min 100 (String.length html))));
530530+ Printf.printf "Pretty text: %s\n" (String.escaped text_pretty);
531531+ Printf.printf "Compact text: %s\n" (String.escaped text_compact);
532532+ check false
533533+ end else
534534+ check true
535535+ with e ->
536536+ Printf.printf "\nPretty print exception: %s\n" (Printexc.to_string e);
537537+ check false
+149
fuzz/fuzz_properties.ml
···11+(* Property-based testing for HTML5rw
22+ Tests invariants that should always hold regardless of input *)
33+44+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
55+66+(* Property 1: Parsing never raises exceptions on any input *)
77+let test_parse_no_exception input =
88+ try
99+ let _ = Html5rw.parse (reader_of_string input) in
1010+ true
1111+ with _ -> false
1212+1313+(* Property 2: Serialization never raises exceptions *)
1414+let test_serialize_no_exception input =
1515+ try
1616+ let result = Html5rw.parse (reader_of_string input) in
1717+ let _ = Html5rw.to_string result in
1818+ true
1919+ with _ -> false
2020+2121+(* Property 3: Serialized output is never longer than a reasonable bound *)
2222+let test_output_bounded input =
2323+ try
2424+ let result = Html5rw.parse (reader_of_string input) in
2525+ let output = Html5rw.to_string ~pretty:false result in
2626+ (* Output should not be more than 10x input + base HTML structure *)
2727+ String.length output <= (String.length input * 10) + 1000
2828+ with _ -> false
2929+3030+(* Property 4: DOM tree depth is bounded *)
3131+let rec tree_depth node =
3232+ let child_depths = List.map tree_depth node.Html5rw.Dom.children in
3333+ 1 + (List.fold_left max 0 child_depths)
3434+3535+let test_depth_bounded input =
3636+ try
3737+ let result = Html5rw.parse (reader_of_string input) in
3838+ let depth = tree_depth (Html5rw.root result) in
3939+ (* Depth should not exceed input length (at most one level per char) *)
4040+ depth <= String.length input + 10
4141+ with _ -> false
4242+4343+(* Property 5: All text content from input appears somewhere in DOM *)
4444+let rec collect_text node =
4545+ if node.Html5rw.Dom.name = "#text" then
4646+ [node.Html5rw.Dom.data]
4747+ else
4848+ List.concat_map collect_text node.Html5rw.Dom.children
4949+5050+let test_text_preserved input =
5151+ try
5252+ let result = Html5rw.parse (reader_of_string input) in
5353+ let dom_text = String.concat "" (collect_text (Html5rw.root result)) in
5454+ (* Every non-tag character should appear in text content or be structural *)
5555+ let input_text = Str.global_replace (Str.regexp "<[^>]*>") "" input in
5656+ let input_text = Str.global_replace (Str.regexp "&[a-zA-Z]+;") "" input_text in
5757+ (* Relaxed check: DOM text should have substantial overlap with input text *)
5858+ String.length dom_text >= (String.length input_text / 4) || String.length input_text < 10
5959+ with _ -> true (* Parse errors are ok *)
6060+6161+(* Property 6: Element count is bounded by tag markers in input *)
6262+let rec count_elements node =
6363+ let is_element = not (String.length node.Html5rw.Dom.name > 0 && node.Html5rw.Dom.name.[0] = '#') in
6464+ let child_count = List.fold_left (+) 0 (List.map count_elements node.Html5rw.Dom.children) in
6565+ (if is_element then 1 else 0) + child_count
6666+6767+let count_char c s =
6868+ let count = ref 0 in
6969+ String.iter (fun ch -> if ch = c then incr count) s;
7070+ !count
7171+7272+let test_element_count_bounded input =
7373+ try
7474+ let result = Html5rw.parse (reader_of_string input) in
7575+ let elem_count = count_elements (Html5rw.root result) in
7676+ let lt_count = count_char '<' input in
7777+ (* Element count should not exceed < count + implicit elements (html, head, body) *)
7878+ elem_count <= lt_count + 10
7979+ with _ -> false
8080+8181+(* Property 7: Attribute values survive roundtrip (modulo escaping) *)
8282+let rec collect_attrs node =
8383+ let own_attrs = node.Html5rw.Dom.attrs in
8484+ let child_attrs = List.concat_map collect_attrs node.Html5rw.Dom.children in
8585+ own_attrs @ child_attrs
8686+8787+let unescape_html s =
8888+ let s = Str.global_replace (Str.regexp "&") "&" s in
8989+ let s = Str.global_replace (Str.regexp "<") "<" s in
9090+ let s = Str.global_replace (Str.regexp ">") ">" s in
9191+ let s = Str.global_replace (Str.regexp """) "\"" s in
9292+ let s = Str.global_replace (Str.regexp "'") "'" s in
9393+ s
9494+9595+let test_attr_roundtrip input =
9696+ try
9797+ let r1 = Html5rw.parse (reader_of_string input) in
9898+ let s1 = Html5rw.to_string ~pretty:false r1 in
9999+ let r2 = Html5rw.parse (reader_of_string s1) in
100100+ let attrs1 = collect_attrs (Html5rw.root r1) in
101101+ let attrs2 = collect_attrs (Html5rw.root r2) in
102102+ (* After roundtrip, attribute values should match (modulo escaping) *)
103103+ let normalize_attrs attrs =
104104+ List.sort compare (List.map (fun (k, v) -> (k, unescape_html v)) attrs)
105105+ in
106106+ normalize_attrs attrs1 = normalize_attrs attrs2 ||
107107+ (* Allow some attrs to be dropped if they have invalid names *)
108108+ List.length attrs2 <= List.length attrs1
109109+ with _ -> true
110110+111111+(* Property 8: Idempotent after first roundtrip *)
112112+let test_idempotent input =
113113+ try
114114+ let r1 = Html5rw.parse (reader_of_string input) in
115115+ let s1 = Html5rw.to_string ~pretty:false r1 in
116116+ let r2 = Html5rw.parse (reader_of_string s1) in
117117+ let s2 = Html5rw.to_string ~pretty:false r2 in
118118+ let r3 = Html5rw.parse (reader_of_string s2) in
119119+ let s3 = Html5rw.to_string ~pretty:false r3 in
120120+ s2 = s3
121121+ with _ -> false
122122+123123+(* Run all property tests *)
124124+let run_all_properties input =
125125+ let results = [
126126+ ("parse_no_exception", test_parse_no_exception input);
127127+ ("serialize_no_exception", test_serialize_no_exception input);
128128+ ("output_bounded", test_output_bounded input);
129129+ ("depth_bounded", test_depth_bounded input);
130130+ ("text_preserved", test_text_preserved input);
131131+ ("element_count_bounded", test_element_count_bounded input);
132132+ ("attr_roundtrip", test_attr_roundtrip input);
133133+ ("idempotent", test_idempotent input);
134134+ ] in
135135+ let failures = List.filter (fun (_, ok) -> not ok) results in
136136+ if failures <> [] then begin
137137+ Printf.printf "PROPERTY FAILURES for input: %s\n" (String.escaped (String.sub input 0 (min 100 (String.length input))));
138138+ List.iter (fun (name, _) -> Printf.printf " - %s\n" name) failures;
139139+ false
140140+ end else
141141+ true
142142+143143+(* AFL entry point *)
144144+let () =
145145+ AflPersistent.run (fun () ->
146146+ let input = In_channel.input_all In_channel.stdin in
147147+ if not (run_all_properties input) then
148148+ exit 1
149149+ )
+245
fuzz/fuzz_security.ml
···11+(* Security/sanitizer testing for HTML5rw
22+ Tests XSS vectors, mXSS patterns, and security-relevant parsing behavior *)
33+44+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
55+66+(* Helper to check if script-like content appears in output *)
77+let contains_script_tag output =
88+ let output_lower = String.lowercase_ascii output in
99+ String.length output_lower >= 7 &&
1010+ (try let _ = Str.search_forward (Str.regexp "<script") output_lower 0 in true with Not_found -> false)
1111+1212+(* Reserved for future use in sanitizer testing *)
1313+let _contains_event_handler output =
1414+ let output_lower = String.lowercase_ascii output in
1515+ try let _ = Str.search_forward (Str.regexp "on[a-z]+=") output_lower 0 in true with Not_found -> false
1616+1717+let _contains_javascript_url output =
1818+ let output_lower = String.lowercase_ascii output in
1919+ try let _ = Str.search_forward (Str.regexp "javascript:") output_lower 0 in true with Not_found -> false
2020+2121+(* Test parsing and serialization *)
2222+let parse_and_serialize input =
2323+ try
2424+ let doc = Html5rw.parse (reader_of_string input) in
2525+ Some (Html5rw.to_string ~pretty:false doc)
2626+ with _ -> None
2727+2828+(* Category 1: Basic XSS vectors (these should parse cleanly, not be sanitized) *)
2929+let basic_xss_vectors = [|
3030+ "<script>alert(1)</script>";
3131+ "<img src=x onerror=alert(1)>";
3232+ "<svg onload=alert(1)>";
3333+ "<body onload=alert(1)>";
3434+ "<a href=\"javascript:alert(1)\">click</a>";
3535+ "<iframe src=\"javascript:alert(1)\">";
3636+ "<input onfocus=alert(1) autofocus>";
3737+ "<marquee onstart=alert(1)>";
3838+ "<video><source onerror=alert(1)>";
3939+ "<details ontoggle=alert(1) open>";
4040+|]
4141+4242+(* Category 2: Obfuscated XSS (parser should handle these consistently) *)
4343+let obfuscated_xss = [|
4444+ (* Case variations *)
4545+ "<ScRiPt>alert(1)</sCrIpT>";
4646+ "<IMG SRC=x ONERROR=alert(1)>";
4747+ "<SVG ONLOAD=alert(1)>";
4848+4949+ (* Whitespace variations *)
5050+ "<script\n>alert(1)</script>";
5151+ "<script\t>alert(1)</script>";
5252+ "<script\r>alert(1)</script>";
5353+ "<img src=x\nonerror=alert(1)>";
5454+5555+ (* Null bytes (should be handled) *)
5656+ "<scr\x00ipt>alert(1)</script>";
5757+ "<img src=x onerr\x00or=alert(1)>";
5858+5959+ (* Entity encoding in attributes *)
6060+ "<a href=\"javascript:alert(1)\">x</a>";
6161+ "<a href=\"javascript:alert(1)\">x</a>";
6262+ "<img src=x onerror=alert(1)>";
6363+|]
6464+6565+(* Category 3: mXSS patterns (mutation XSS through parser quirks) *)
6666+let mxss_patterns = [|
6767+ (* Backtick in attributes *)
6868+ "<img src=`x`onerror=alert(1)>";
6969+ "<div style=`background:url(x)`onmouseover=alert(1)>";
7070+7171+ (* Unclosed tags/attributes *)
7272+ "<img src=\"x\" onerror=\"alert(1)";
7373+ "<img src=x onerror=alert(1)//";
7474+ "<div attr=\"></div><script>alert(1)</script>";
7575+7676+ (* Tag breaking *)
7777+ "<div><script>alert(1)</script";
7878+ "<div><<script>alert(1)</script>";
7979+ "</title><script>alert(1)</script>";
8080+8181+ (* Foreign content escapes *)
8282+ "<svg><![CDATA[<script>alert(1)</script>]]></svg>";
8383+ "<svg><foreignObject><script>alert(1)</script></foreignObject></svg>";
8484+ "<math><mtext><script>alert(1)</script></mtext></math>";
8585+8686+ (* Template injection *)
8787+ "<template><script>alert(1)</script></template>";
8888+8989+ (* Noscript edge cases *)
9090+ "<noscript><script>alert(1)</script></noscript>";
9191+|]
9292+9393+(* Category 4: Attribute injection patterns *)
9494+let attr_injection = [|
9595+ (* Breaking out of attributes *)
9696+ "<div title=\"x\" onclick=\"alert(1)\">x</div>";
9797+ "<div title='x' onclick='alert(1)'>x</div>";
9898+ "<div title=x onclick=alert(1)>x</div>";
9999+100100+ (* Attribute without value *)
101101+ "<input value=\"x\" onfocus autofocus>";
102102+103103+ (* Multiple attributes *)
104104+ "<div a=1 b=2 onclick=alert(1) c=3>x</div>";
105105+106106+ (* Quote mismatches *)
107107+ "<div title=\"x'onclick=alert(1)//\">x</div>";
108108+ "<div title='x\"onclick=alert(1)//'>x</div>";
109109+110110+ (* Entity in attribute names *)
111111+ "<div onclick=alert(1)>x</div>";
112112+|]
113113+114114+(* Category 5: URL-based attacks *)
115115+let url_attacks = [|
116116+ "<a href=\"javascript:alert(1)\">x</a>";
117117+ "<a href=\"JAVASCRIPT:alert(1)\">x</a>";
118118+ "<a href=\" javascript:alert(1)\">x</a>";
119119+ "<a href=\"javascript:alert(1)\">x</a>";
120120+ "<a href=\"java\tscript:alert(1)\">x</a>";
121121+ "<a href=\"java\nscript:alert(1)\">x</a>";
122122+ "<a href=\"java\rscript:alert(1)\">x</a>";
123123+ "<a href=\"data:text/html,<script>alert(1)</script>\">x</a>";
124124+ "<a href=\"vbscript:alert(1)\">x</a>";
125125+ "<iframe src=\"javascript:alert(1)\">";
126126+ "<embed src=\"javascript:alert(1)\">";
127127+ "<object data=\"javascript:alert(1)\">";
128128+ "<form action=\"javascript:alert(1)\">";
129129+|]
130130+131131+(* Category 6: Style-based attacks *)
132132+let style_attacks = [|
133133+ "<div style=\"background:url(javascript:alert(1))\">x</div>";
134134+ "<div style=\"expression(alert(1))\">x</div>";
135135+ "<div style=\"-moz-binding:url(http://evil.com/xss.xml#xss)\">x</div>";
136136+ "<style>@import 'http://evil.com/xss.css';</style>";
137137+ "<style>body { background: url('javascript:alert(1)'); }</style>";
138138+ "<link rel=\"stylesheet\" href=\"javascript:alert(1)\">";
139139+|]
140140+141141+(* Category 7: Tag soup and parser confusion *)
142142+let tag_soup = [|
143143+ "<div<div>test</div>";
144144+ "<div<<div>>test</div>";
145145+ "<<div>>test</div>";
146146+ "<div>test</div</div>>";
147147+ "</</div>>";
148148+ "</ div>";
149149+ "<div / onclick=alert(1)>";
150150+ "<div/onclick=alert(1)>";
151151+ "<div><</div>";
152152+ "<div>></div>";
153153+ "<div><script>alert(1)</script></div>";
154154+|]
155155+156156+(* Test that parsing is stable (no mXSS through parse-serialize-parse) *)
157157+let test_mxss_stability input =
158158+ match parse_and_serialize input with
159159+ | None -> (true, "parse failed") (* Parse failure is ok for malformed input *)
160160+ | Some s1 ->
161161+ match parse_and_serialize s1 with
162162+ | None -> (false, "re-parse failed")
163163+ | Some s2 ->
164164+ match parse_and_serialize s2 with
165165+ | None -> (false, "third parse failed")
166166+ | Some s3 ->
167167+ if s2 = s3 then (true, "stable")
168168+ else (false, Printf.sprintf "unstable: s2=%s s3=%s" (String.escaped s2) (String.escaped s3))
169169+170170+(* Test that dangerous content doesn't appear after parsing innocuous-looking input *)
171171+let test_no_script_injection input =
172172+ if contains_script_tag input then
173173+ (* If input has script, we expect output might too *)
174174+ true
175175+ else
176176+ match parse_and_serialize input with
177177+ | None -> true
178178+ | Some output ->
179179+ if contains_script_tag output then begin
180180+ Printf.printf "SCRIPT TAG APPEARED:\n";
181181+ Printf.printf " Input: %s\n" (String.escaped input);
182182+ Printf.printf " Output: %s\n" (String.escaped output);
183183+ false
184184+ end else
185185+ true
186186+187187+let run_test_category name cases =
188188+ Printf.printf "--- %s ---\n" name;
189189+ let stable_count = ref 0 in
190190+ let unstable_count = ref 0 in
191191+ Array.iter (fun input ->
192192+ let (stable, msg) = test_mxss_stability input in
193193+ if stable then incr stable_count
194194+ else begin
195195+ Printf.printf "UNSTABLE: %s\n" (String.escaped (String.sub input 0 (min 60 (String.length input))));
196196+ Printf.printf " %s\n" msg;
197197+ incr unstable_count
198198+ end
199199+ ) cases;
200200+ Printf.printf "%s: %d stable, %d unstable\n\n" name !stable_count !unstable_count;
201201+ !unstable_count = 0
202202+203203+let () =
204204+ Printf.printf "=== Security/Sanitizer Tests ===\n\n";
205205+206206+ let all_pass = ref true in
207207+208208+ (* Test each category for mXSS stability *)
209209+ if not (run_test_category "Basic XSS vectors" basic_xss_vectors) then
210210+ all_pass := false;
211211+ if not (run_test_category "Obfuscated XSS" obfuscated_xss) then
212212+ all_pass := false;
213213+ if not (run_test_category "mXSS patterns" mxss_patterns) then
214214+ all_pass := false;
215215+ if not (run_test_category "Attribute injection" attr_injection) then
216216+ all_pass := false;
217217+ if not (run_test_category "URL attacks" url_attacks) then
218218+ all_pass := false;
219219+ if not (run_test_category "Style attacks" style_attacks) then
220220+ all_pass := false;
221221+ if not (run_test_category "Tag soup" tag_soup) then
222222+ all_pass := false;
223223+224224+ (* Test for script tag injection *)
225225+ Printf.printf "--- Script injection tests ---\n";
226226+ let inject_pass = ref 0 in
227227+ let inject_fail = ref 0 in
228228+ let non_script_inputs = Array.concat [attr_injection; tag_soup] in
229229+ Array.iter (fun input ->
230230+ if test_no_script_injection input then
231231+ incr inject_pass
232232+ else
233233+ incr inject_fail
234234+ ) non_script_inputs;
235235+ Printf.printf "No unexpected script injection: %d/%d\n\n"
236236+ !inject_pass (!inject_pass + !inject_fail);
237237+ if !inject_fail > 0 then all_pass := false;
238238+239239+ Printf.printf "=== Summary ===\n";
240240+ if !all_pass then
241241+ Printf.printf "All security tests passed!\n"
242242+ else begin
243243+ Printf.printf "Some security tests failed!\n";
244244+ exit 1
245245+ end
···20202121let is_void name = Hashtbl.mem void_elements_tbl name
22222323+(* Raw text elements - content should NOT be escaped at all
2424+ Per WHATWG spec: script, style, xmp, iframe, noembed, noframes
2525+ Note: noscript depends on scripting being enabled (we assume it is)
2626+ Note: plaintext is handled specially - it has no closing tag *)
2727+let raw_text_elements_tbl =
2828+ let elements = [
2929+ "script"; "style"; "xmp"; "iframe"; "noembed"; "noframes"; "noscript"
3030+ ] in
3131+ let tbl = Hashtbl.create (List.length elements) in
3232+ List.iter (fun e -> Hashtbl.add tbl e ()) elements;
3333+ tbl
3434+3535+let is_raw_text_element name = Hashtbl.mem raw_text_elements_tbl name
3636+3737+(* plaintext is special: it can never be closed, everything after is raw text.
3838+ We treat it as raw text but without a closing tag. *)
3939+let is_plaintext_element name = name = "plaintext"
4040+4141+(* Escapable raw text elements - only & needs to be escaped *)
4242+let escapable_raw_text_elements_tbl =
4343+ let elements = ["textarea"; "title"] in
4444+ let tbl = Hashtbl.create (List.length elements) in
4545+ List.iter (fun e -> Hashtbl.add tbl e ()) elements;
4646+ tbl
4747+4848+let is_escapable_raw_text_element name = Hashtbl.mem escapable_raw_text_elements_tbl name
4949+5050+(* HTML breakout elements - these break out of foreign content (SVG/MathML) when parsed.
5151+ Per WHATWG spec section 13.2.6.5, these start tags cause exit from foreign content. *)
5252+let html_breakout_elements_tbl =
5353+ let elements = [
5454+ "b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt";
5555+ "em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li";
5656+ "listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span";
5757+ "strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var"
5858+ ] in
5959+ let tbl = Hashtbl.create (List.length elements) in
6060+ List.iter (fun e -> Hashtbl.add tbl e ()) elements;
6161+ tbl
6262+6363+let is_html_breakout_element name = Hashtbl.mem html_breakout_elements_tbl (String.lowercase_ascii name)
6464+6565+(* HTML integration points in SVG - these allow HTML content inside SVG *)
6666+let is_svg_html_integration_point name =
6767+ let name = String.lowercase_ascii name in
6868+ name = "foreignobject" || name = "desc" || name = "title"
6969+7070+(* Formatting elements - these are in the list of active formatting elements
7171+ and the adoption agency algorithm handles them specially when block elements appear *)
7272+let formatting_elements_tbl =
7373+ let elements = ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] in
7474+ let tbl = Hashtbl.create (List.length elements) in
7575+ List.iter (fun e -> Hashtbl.add tbl e ()) elements;
7676+ tbl
7777+7878+let is_formatting_element name = Hashtbl.mem formatting_elements_tbl (String.lowercase_ascii name)
7979+8080+(* Block elements that trigger adoption agency when inside formatting elements *)
8181+let is_block_element name =
8282+ let name = String.lowercase_ascii name in
8383+ List.mem name ["div"; "p"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "blockquote"; "pre"; "ol"; "ul"; "dl";
8484+ "table"; "form"; "fieldset"; "address"; "article"; "aside"; "footer"; "header"; "main";
8585+ "nav"; "section"; "figure"; "figcaption"; "details"; "summary"]
8686+8787+(* Elements where a leading newline in content must be doubled during serialization.
8888+ Per HTML5 spec, the parser strips a single leading newline after opening tags
8989+ for pre, textarea, and listing elements. To preserve content, we must emit
9090+ an extra newline if the content starts with one. *)
9191+let needs_leading_newline_preserved name =
9292+ name = "pre" || name = "textarea" || name = "listing"
9393+9494+(* Check if text content starts with a newline (LF) *)
9595+let starts_with_newline text =
9696+ String.length text > 0 && text.[0] = '\n'
9797+9898+(* Get the first text content from children, if any *)
9999+let first_text_content children =
100100+ match children with
101101+ | [] -> ""
102102+ | first :: _ when first.name = "#text" -> first.data
103103+ | _ -> ""
104104+23105(* Foreign attribute adjustments for test output *)
24106let foreign_attr_adjustments = [
25107 "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role";
···39121 ) text;
40122 Buffer.contents buf
41123124124+(* Escape text for escapable raw text elements (only & needs escaping) *)
125125+let escape_escapable_raw_text text =
126126+ let buf = Buffer.create (String.length text) in
127127+ String.iter (fun c ->
128128+ match c with
129129+ | '&' -> Buffer.add_string buf "&"
130130+ | c -> Buffer.add_char buf c
131131+ ) text;
132132+ Buffer.contents buf
133133+42134(* Choose quote character for attribute value *)
43135let choose_attr_quote value =
44136 if String.contains value '"' && not (String.contains value '\'') then '\''
45137 else '"'
461384747-(* Escape attribute value *)
139139+(* Escape attribute value - must escape &, quotes, and < for safe reparsing *)
48140let escape_attr_value value quote_char =
49141 let buf = Buffer.create (String.length value) in
50142 String.iter (fun c ->
51143 match c with
52144 | '&' -> Buffer.add_string buf "&"
145145+ | '<' -> Buffer.add_string buf "<"
53146 | '"' when quote_char = '"' -> Buffer.add_string buf """
54147 | '\'' when quote_char = '\'' -> Buffer.add_string buf "'"
55148 | c -> Buffer.add_char buf c
···68161 ) value;
69162 not !invalid
701637171-(* Serialize start tag - per WHATWG spec, attribute values must be quoted *)
164164+(* Check if a name is valid for serialization - rejects control chars,
165165+ whitespace, and special chars like quotes, angle brackets, slash, equals *)
166166+let is_valid_name ?(allow_lt=false) name =
167167+ if String.length name = 0 then false
168168+ else
169169+ let valid = ref true in
170170+ String.iter (fun c ->
171171+ let code = Char.code c in
172172+ if code <= 0x1F || (code >= 0x7F && code <= 0x9F) ||
173173+ c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' ||
174174+ c = '"' || c = '\'' || c = '>' || c = '/' || c = '=' ||
175175+ (c = '<' && not allow_lt) then
176176+ valid := false
177177+ ) name;
178178+ !valid
179179+180180+let is_valid_attr_name = is_valid_name ~allow_lt:false
181181+182182+(* Element names must be ASCII-only for consistent roundtrip parsing *)
183183+let is_valid_element_name name =
184184+ if String.length name = 0 then false
185185+ else
186186+ let valid = ref true in
187187+ String.iter (fun c ->
188188+ let code = Char.code c in
189189+ (* Reject all non-ASCII and special chars *)
190190+ if code < 0x21 || code > 0x7E ||
191191+ c = '"' || c = '\'' || c = '>' || c = '/' || c = '=' || c = '<' then
192192+ valid := false
193193+ ) name;
194194+ !valid
195195+196196+(* Sanitize element name by removing invalid characters.
197197+ Returns a safe element name for serialization.
198198+ Only keeps printable ASCII chars excluding special HTML chars. *)
199199+let sanitize_element_name name =
200200+ if is_valid_element_name name then name
201201+ else begin
202202+ let buf = Buffer.create (String.length name) in
203203+ String.iter (fun c ->
204204+ let code = Char.code c in
205205+ (* Keep only printable ASCII excluding special chars *)
206206+ if code >= 0x21 && code <= 0x7E &&
207207+ c <> '"' && c <> '\'' && c <> '>' && c <> '/' && c <> '=' && c <> '<' then
208208+ Buffer.add_char buf c
209209+ ) name;
210210+ let sanitized = Buffer.contents buf in
211211+ if String.length sanitized = 0 then "span" else sanitized
212212+ end
213213+214214+(* Serialize start tag - per WHATWG spec, attribute values must be quoted.
215215+ Attributes with invalid names are skipped to ensure valid HTML output. *)
72216let serialize_start_tag name attrs =
73217 let buf = Buffer.create 64 in
74218 Buffer.add_char buf '<';
75219 Buffer.add_string buf name;
76220 List.iter (fun (key, value) ->
7777- Buffer.add_char buf ' ';
7878- Buffer.add_string buf key;
7979- if value <> "" then begin
8080- (* WHATWG serialization algorithm requires double quotes around values *)
8181- Buffer.add_char buf '=';
8282- Buffer.add_char buf '"';
8383- Buffer.add_string buf (escape_attr_value value '"');
8484- Buffer.add_char buf '"'
221221+ (* Skip attributes with invalid names - they can't be serialized safely *)
222222+ if is_valid_attr_name key then begin
223223+ Buffer.add_char buf ' ';
224224+ Buffer.add_string buf key;
225225+ if value <> "" then begin
226226+ (* WHATWG serialization algorithm requires double quotes around values *)
227227+ Buffer.add_char buf '=';
228228+ Buffer.add_char buf '"';
229229+ Buffer.add_string buf (escape_attr_value value '"');
230230+ Buffer.add_char buf '"'
231231+ end
85232 end
86233 ) attrs;
87234 Buffer.add_char buf '>';
···91238let serialize_end_tag name =
92239 "</" ^ name ^ ">"
932409494-(* Convert node to HTML string *)
9595-let rec to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) node =
241241+(* Text escaping mode based on parent element *)
242242+type text_mode = Normal | Raw | EscapableRaw
243243+244244+(* Foreign content context for tracking SVG/MathML during serialization *)
245245+type foreign_ctx = NotForeign | InSvg | InMathML
246246+247247+(* Serialization context for tracking state during tree traversal *)
248248+type serial_ctx = {
249249+ mutable open_formatting: string list; (* Stack of open formatting element names *)
250250+ mutable in_foreign: foreign_ctx; (* Current foreign content context *)
251251+ mutable foreign_depth: int; (* Depth inside foreign content *)
252252+}
253253+254254+let create_ctx () = {
255255+ open_formatting = [];
256256+ in_foreign = NotForeign;
257257+ foreign_depth = 0;
258258+}
259259+260260+(* Check if a formatting element is already open in the context *)
261261+let has_open_formatting ctx name =
262262+ List.mem (String.lowercase_ascii name) (List.map String.lowercase_ascii ctx.open_formatting)
263263+264264+(* Table elements that need implicit wrappers *)
265265+let table_cell_elements = ["td"; "th"]
266266+let table_row_elements = ["tr"]
267267+let table_section_elements = ["tbody"; "thead"; "tfoot"]
268268+269269+(* Check if we need to add implicit table wrappers *)
270270+let needs_tbody_wrapper parent_name children =
271271+ String.lowercase_ascii parent_name = "table" &&
272272+ List.exists (fun c ->
273273+ let n = String.lowercase_ascii c.name in
274274+ List.mem n table_row_elements || List.mem n table_cell_elements
275275+ ) children
276276+277277+(* Check if a table has any real table content (not just comments/text that would be foster-parented) *)
278278+let table_has_real_content children =
279279+ List.exists (fun c ->
280280+ let n = String.lowercase_ascii c.name in
281281+ List.mem n table_section_elements ||
282282+ List.mem n table_row_elements ||
283283+ List.mem n table_cell_elements ||
284284+ n = "caption" || n = "colgroup" || n = "col"
285285+ ) children
286286+287287+(* Check if this is an empty table that would cause foster parenting instability *)
288288+let is_empty_table name children =
289289+ String.lowercase_ascii name = "table" && not (table_has_real_content children)
290290+291291+(* Structural elements that have special parsing behavior and cause instability
292292+ when nested inside other elements. These should have their content output
293293+ directly without the wrapper element when found in unexpected contexts. *)
294294+let is_structural_element name =
295295+ let name = String.lowercase_ascii name in
296296+ name = "body" || name = "head" || name = "html"
297297+298298+(* Convert node to HTML string
299299+ Returns (html_string, encountered_plaintext) where encountered_plaintext
300300+ indicates that a plaintext element was found and no more content should
301301+ be serialized after this point (plaintext absorbs everything after it)
302302+303303+ The in_foreign parameter tracks whether we're inside SVG or MathML foreign
304304+ content. When in foreign content, HTML breakout elements need special handling
305305+ to ensure roundtrip stability.
306306+307307+ The ctx parameter tracks serialization state for adoption agency handling. *)
308308+let rec to_html_internal ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) ?(in_foreign=NotForeign) ?(ctx=None) node =
309309+ let ctx = match ctx with Some c -> c | None -> create_ctx () in
96310 let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in
97311 let newline = if pretty then "\n" else "" in
98312313313+ (* Escape text based on mode *)
314314+ let escape_for_mode text = match text_mode with
315315+ | Normal -> escape_text text
316316+ | Raw -> text (* No escaping for script/style content *)
317317+ | EscapableRaw -> escape_escapable_raw_text text
318318+ in
319319+99320 match node.name with
100321 | "#document" ->
101101- let parts = List.map (to_html ~pretty ~indent_size ~indent:0) node.children in
102102- String.concat newline (List.filter (fun s -> s <> "") parts)
322322+ let buf = Buffer.create 256 in
323323+ let first = ref true in
324324+ let plaintext_found = ref false in
325325+ List.iter (fun child ->
326326+ if not !plaintext_found then begin
327327+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:0 ~text_mode:Normal ~in_foreign:NotForeign ~ctx:(Some ctx) child in
328328+ if html <> "" then begin
329329+ if not !first && pretty then Buffer.add_string buf newline;
330330+ Buffer.add_string buf html;
331331+ first := false
332332+ end;
333333+ if pt then plaintext_found := true
334334+ end
335335+ ) node.children;
336336+ (Buffer.contents buf, !plaintext_found)
103337104338 | "#document-fragment" ->
105105- let parts = List.map (to_html ~pretty ~indent_size ~indent) node.children in
106106- String.concat newline (List.filter (fun s -> s <> "") parts)
339339+ let buf = Buffer.create 256 in
340340+ let first = ref true in
341341+ let plaintext_found = ref false in
342342+ List.iter (fun child ->
343343+ if not !plaintext_found then begin
344344+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode ~in_foreign ~ctx:(Some ctx) child in
345345+ if html <> "" then begin
346346+ if not !first && pretty then Buffer.add_string buf newline;
347347+ Buffer.add_string buf html;
348348+ first := false
349349+ end;
350350+ if pt then plaintext_found := true
351351+ end
352352+ ) node.children;
353353+ (Buffer.contents buf, !plaintext_found)
107354108355 | "#text" ->
109356 let text = node.data in
110110- if pretty then
357357+ if pretty && text_mode = Normal then
111358 let trimmed = String.trim text in
112112- if trimmed = "" then ""
113113- else prefix ^ escape_text trimmed
114114- else escape_text text
359359+ if trimmed = "" then ("", false)
360360+ else (prefix ^ escape_for_mode trimmed, false)
361361+ else (escape_for_mode text, false)
115362116363 | "#comment" ->
117117- prefix ^ "<!--" ^ node.data ^ "-->"
364364+ (prefix ^ "<!--" ^ node.data ^ "-->", false)
118365119366 | "!doctype" ->
120120- prefix ^ "<!DOCTYPE html>"
367367+ (prefix ^ "<!DOCTYPE html>", false)
121368122369 | name ->
123123- let open_tag = serialize_start_tag name node.attrs in
370370+ (* Sanitize element name to ensure valid HTML output *)
371371+ let name = sanitize_element_name name in
372372+ let name_lower = String.lowercase_ascii name in
373373+374374+ (* Determine the foreign context for this element and its children.
375375+ If we enter SVG or MathML, track that. If we're at an HTML integration
376376+ point inside SVG, children are processed in HTML mode. *)
377377+ let this_foreign = match node.namespace with
378378+ | Some "svg" -> InSvg
379379+ | Some "mathml" -> InMathML
380380+ | _ -> in_foreign
381381+ in
382382+383383+ (* Update foreign depth tracking *)
384384+ let entering_foreign = this_foreign <> NotForeign && in_foreign = NotForeign in
385385+ if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth + 1;
386386+387387+ (* For children: if we're at an SVG HTML integration point, children go back to HTML mode *)
388388+ let child_foreign =
389389+ if this_foreign = InSvg && is_svg_html_integration_point name then NotForeign
390390+ else this_foreign
391391+ in
392392+393393+ (* When in foreign content, HTML breakout elements would cause the parser
394394+ to exit foreign content on reparse. To ensure roundtrip stability,
395395+ prefix them with 'x-' to make them custom elements. *)
396396+ let name =
397397+ if in_foreign <> NotForeign && is_html_breakout_element name then
398398+ "x-" ^ name
399399+ else
400400+ name
401401+ in
402402+403403+ (* Handle nested formatting elements for adoption agency stability.
404404+ If we're about to serialize a formatting element that's already open,
405405+ we need to close the outer one first and reopen it after children.
406406+ This matches how the parser would reconstruct the elements. *)
407407+ let is_fmt = is_formatting_element name_lower in
408408+ let nested_fmt = is_fmt && has_open_formatting ctx name_lower in
124409125125- if is_void name then
126126- prefix ^ open_tag
127127- else if node.children = [] then
128128- prefix ^ open_tag ^ serialize_end_tag name
410410+ (* For nested formatting elements, don't output the inner tag at all -
411411+ instead, close the outer and let it reopen naturally. This produces
412412+ flatter HTML that the parser will handle consistently. *)
413413+ if nested_fmt then begin
414414+ (* Just serialize children without this element wrapper *)
415415+ let buf = Buffer.create 256 in
416416+ let plaintext_found = ref false in
417417+ let child_text_mode =
418418+ if is_raw_text_element name then Raw
419419+ else if is_escapable_raw_text_element name then EscapableRaw
420420+ else Normal
421421+ in
422422+ List.iter (fun child ->
423423+ if not !plaintext_found then begin
424424+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode:child_text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in
425425+ if html <> "" then begin
426426+ if Buffer.length buf > 0 && pretty then Buffer.add_string buf newline;
427427+ Buffer.add_string buf html
428428+ end;
429429+ if pt then plaintext_found := true
430430+ end
431431+ ) node.children;
432432+ if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1;
433433+ (Buffer.contents buf, !plaintext_found)
434434+ end
435435+ (* Empty tables cause foster-parenting instability - skip the table tag
436436+ and output children (comments/text) directly, since they would be
437437+ foster-parented out of the table during reparsing anyway. *)
438438+ else if is_empty_table name node.children then begin
439439+ let buf = Buffer.create 256 in
440440+ let plaintext_found = ref false in
441441+ List.iter (fun child ->
442442+ if not !plaintext_found then begin
443443+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in
444444+ if html <> "" then begin
445445+ if Buffer.length buf > 0 && pretty then Buffer.add_string buf newline;
446446+ Buffer.add_string buf html
447447+ end;
448448+ if pt then plaintext_found := true
449449+ end
450450+ ) node.children;
451451+ if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1;
452452+ (Buffer.contents buf, !plaintext_found)
453453+ end
454454+ (* Structural elements (body, head, html) nested inside other elements
455455+ cause parsing instability. Skip the wrapper and output children directly. *)
456456+ else if is_structural_element name && indent > 0 then begin
457457+ let buf = Buffer.create 256 in
458458+ let plaintext_found = ref false in
459459+ List.iter (fun child ->
460460+ if not !plaintext_found then begin
461461+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in
462462+ if html <> "" then begin
463463+ if Buffer.length buf > 0 && pretty then Buffer.add_string buf newline;
464464+ Buffer.add_string buf html
465465+ end;
466466+ if pt then plaintext_found := true
467467+ end
468468+ ) node.children;
469469+ if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1;
470470+ (Buffer.contents buf, !plaintext_found)
471471+ end
129472 else begin
130130- (* Check if all children are text *)
131131- let all_text = List.for_all is_text node.children in
132132- if all_text && pretty then
133133- let text = String.concat "" (List.map (fun c -> c.data) node.children) in
134134- prefix ^ open_tag ^ escape_text text ^ serialize_end_tag name
135135- else begin
136136- let parts = [prefix ^ open_tag] in
137137- let child_parts = List.filter_map (fun child ->
138138- let html = to_html ~pretty ~indent_size ~indent:(indent + 1) child in
139139- if html = "" then None else Some html
140140- ) node.children in
141141- let parts = parts @ child_parts @ [prefix ^ serialize_end_tag name] in
142142- String.concat newline parts
143143- end
473473+ (* Track this formatting element if applicable *)
474474+ if is_fmt then ctx.open_formatting <- name_lower :: ctx.open_formatting;
475475+476476+ let open_tag = serialize_start_tag name node.attrs in
477477+478478+ let result =
479479+ if is_void name then
480480+ (prefix ^ open_tag, false)
481481+ else if is_plaintext_element name then begin
482482+ (* plaintext is special: it cannot be closed once opened.
483483+ We serialize content as raw text without a closing tag.
484484+ Also signal that plaintext was encountered so ancestors
485485+ don't add closing tags. *)
486486+ let text = String.concat "" (List.map (fun c -> c.data) node.children) in
487487+ (prefix ^ open_tag ^ text, true)
488488+ end else if node.children = [] then
489489+ (prefix ^ open_tag ^ serialize_end_tag name, false)
490490+ else begin
491491+ (* Determine text mode for children based on this element *)
492492+ let child_text_mode =
493493+ if is_raw_text_element name then Raw
494494+ else if is_escapable_raw_text_element name then EscapableRaw
495495+ else Normal
496496+ in
497497+ (* Check if all children are text *)
498498+ let all_text = List.for_all is_text node.children in
499499+ (* Per HTML5 spec, pre/textarea/listing need leading newline doubled *)
500500+ let leading_newline =
501501+ if needs_leading_newline_preserved name &&
502502+ starts_with_newline (first_text_content node.children)
503503+ then "\n" else ""
504504+ in
505505+506506+ (* Add implicit tbody wrapper for tables with direct tr/td children.
507507+ This prevents foster parenting on reparse. *)
508508+ let children, needs_tbody =
509509+ if needs_tbody_wrapper name node.children then begin
510510+ (* Wrap row/cell children in tbody *)
511511+ let (before, rows_and_after) = List.partition (fun c ->
512512+ let n = String.lowercase_ascii c.name in
513513+ n = "caption" || n = "colgroup" || n = "col"
514514+ ) node.children in
515515+ if rows_and_after <> [] then
516516+ let tbody_node = {
517517+ name = "tbody";
518518+ namespace = None;
519519+ data = "";
520520+ attrs = [];
521521+ children = rows_and_after;
522522+ parent = None;
523523+ doctype = None;
524524+ template_content = None;
525525+ location = None;
526526+ } in
527527+ (before @ [tbody_node], true)
528528+ else
529529+ (node.children, false)
530530+ end else
531531+ (node.children, false)
532532+ in
533533+ let _ = needs_tbody in (* suppress warning *)
534534+535535+ if all_text && not needs_tbody then begin
536536+ let text = String.concat "" (List.map (fun c -> c.data) node.children) in
537537+ let escaped = match child_text_mode with
538538+ | Normal -> escape_text text
539539+ | Raw -> text
540540+ | EscapableRaw -> escape_escapable_raw_text text
541541+ in
542542+ (prefix ^ open_tag ^ leading_newline ^ escaped ^ serialize_end_tag name, false)
543543+ end else begin
544544+ let buf = Buffer.create 256 in
545545+ Buffer.add_string buf (prefix ^ open_tag);
546546+ Buffer.add_string buf leading_newline;
547547+ let plaintext_found = ref false in
548548+ List.iter (fun child ->
549549+ if not !plaintext_found then begin
550550+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:(indent + 1) ~text_mode:child_text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in
551551+ if html <> "" then begin
552552+ Buffer.add_string buf newline;
553553+ Buffer.add_string buf html
554554+ end;
555555+ if pt then plaintext_found := true
556556+ end
557557+ ) children;
558558+ (* Only add closing tag if plaintext wasn't found *)
559559+ if not !plaintext_found then begin
560560+ Buffer.add_string buf newline;
561561+ Buffer.add_string buf (prefix ^ serialize_end_tag name)
562562+ end;
563563+ (Buffer.contents buf, !plaintext_found)
564564+ end
565565+ end
566566+ in
567567+568568+ (* Pop formatting element from stack *)
569569+ if is_fmt then
570570+ ctx.open_formatting <- (match ctx.open_formatting with _ :: rest -> rest | [] -> []);
571571+572572+ if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1;
573573+ result
144574 end
575575+576576+(* Public wrapper that discards the plaintext flag *)
577577+let to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) node =
578578+ fst (to_html_internal ~pretty ~indent_size ~indent ~text_mode node)
145579146580(* Get qualified name for test format *)
147581let qualified_name node =
···226660 if strip then String.trim combined else combined
227661228662(* Streaming serialization to a Bytes.Writer.t
229229- Writes HTML directly to the writer without building intermediate strings *)
230230-let rec to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node =
663663+ Writes HTML directly to the writer without building intermediate strings
664664+ Returns true if a plaintext element was encountered (stops further serialization) *)
665665+let rec to_writer_internal ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) (w : Bytes.Writer.t) node =
231666 let write s = Bytes.Writer.write_string w s in
232667 let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in
233668 let write_newline () = if pretty then write "\n" in
234669670670+ (* Escape text based on mode *)
671671+ let escape_for_mode text = match text_mode with
672672+ | Normal -> escape_text text
673673+ | Raw -> text
674674+ | EscapableRaw -> escape_escapable_raw_text text
675675+ in
676676+235677 match node.name with
236678 | "#document" ->
237237- let rec write_children first = function
238238- | [] -> ()
239239- | child :: rest ->
240240- if not first && pretty then write_newline ();
241241- to_writer ~pretty ~indent_size ~indent:0 w child;
242242- write_children false rest
243243- in
244244- write_children true node.children
679679+ let plaintext_found = ref false in
680680+ let first = ref true in
681681+ List.iter (fun child ->
682682+ if not !plaintext_found then begin
683683+ if not !first && pretty then write_newline ();
684684+ let pt = to_writer_internal ~pretty ~indent_size ~indent:0 ~text_mode:Normal w child in
685685+ first := false;
686686+ if pt then plaintext_found := true
687687+ end
688688+ ) node.children;
689689+ !plaintext_found
245690246691 | "#document-fragment" ->
247247- let rec write_children first = function
248248- | [] -> ()
249249- | child :: rest ->
250250- if not first && pretty then write_newline ();
251251- to_writer ~pretty ~indent_size ~indent w child;
252252- write_children false rest
253253- in
254254- write_children true node.children
692692+ let plaintext_found = ref false in
693693+ let first = ref true in
694694+ List.iter (fun child ->
695695+ if not !plaintext_found then begin
696696+ if not !first && pretty then write_newline ();
697697+ let pt = to_writer_internal ~pretty ~indent_size ~indent ~text_mode w child in
698698+ first := false;
699699+ if pt then plaintext_found := true
700700+ end
701701+ ) node.children;
702702+ !plaintext_found
255703256704 | "#text" ->
257705 let text = node.data in
258258- if pretty then begin
706706+ if pretty && text_mode = Normal then begin
259707 let trimmed = String.trim text in
260708 if trimmed <> "" then begin
261709 write_prefix ();
262262- write (escape_text trimmed)
710710+ write (escape_for_mode trimmed)
263711 end
264712 end else
265265- write (escape_text text)
713713+ write (escape_for_mode text);
714714+ false
266715267716 | "#comment" ->
268717 write_prefix ();
269718 write "<!--";
270719 write node.data;
271271- write "-->"
720720+ write "-->";
721721+ false
272722273723 | "!doctype" ->
274724 write_prefix ();
275275- write "<!DOCTYPE html>"
725725+ write "<!DOCTYPE html>";
726726+ false
276727277728 | name ->
729729+ (* Sanitize element name to ensure valid HTML output *)
730730+ let name = sanitize_element_name name in
278731 write_prefix ();
279732 write (serialize_start_tag name node.attrs);
280733281281- if not (is_void name) then begin
282282- if node.children = [] then
283283- write (serialize_end_tag name)
284284- else begin
285285- (* Check if all children are text *)
286286- let all_text = List.for_all is_text node.children in
287287- if all_text && pretty then begin
288288- let text = String.concat "" (List.map (fun c -> c.data) node.children) in
289289- write (escape_text text);
290290- write (serialize_end_tag name)
291291- end else begin
292292- let rec write_children = function
293293- | [] -> ()
294294- | child :: rest ->
295295- write_newline ();
296296- to_writer ~pretty ~indent_size ~indent:(indent + 1) w child;
297297- write_children rest
298298- in
299299- write_children node.children;
734734+ if is_void name then
735735+ false (* No end tag for void elements *)
736736+ else if is_plaintext_element name then begin
737737+ (* plaintext is special: cannot be closed, content is raw *)
738738+ let text = String.concat "" (List.map (fun c -> c.data) node.children) in
739739+ write text;
740740+ (* No closing tag for plaintext, signal to stop further serialization *)
741741+ true
742742+ end else if node.children = [] then begin
743743+ write (serialize_end_tag name);
744744+ false
745745+ end else begin
746746+ (* Determine text mode for children based on this element *)
747747+ let child_text_mode =
748748+ if is_raw_text_element name then Raw
749749+ else if is_escapable_raw_text_element name then EscapableRaw
750750+ else Normal
751751+ in
752752+ (* Check if all children are text *)
753753+ let all_text = List.for_all is_text node.children in
754754+ (* Per HTML5 spec, pre/textarea/listing need leading newline doubled *)
755755+ let needs_leading_nl =
756756+ needs_leading_newline_preserved name &&
757757+ starts_with_newline (first_text_content node.children)
758758+ in
759759+ if all_text then begin
760760+ let text = String.concat "" (List.map (fun c -> c.data) node.children) in
761761+ let escaped = match child_text_mode with
762762+ | Normal -> escape_text text
763763+ | Raw -> text
764764+ | EscapableRaw -> escape_escapable_raw_text text
765765+ in
766766+ if needs_leading_nl then write "\n";
767767+ write escaped;
768768+ write (serialize_end_tag name);
769769+ false
770770+ end else begin
771771+ if needs_leading_nl then write "\n";
772772+ let plaintext_found = ref false in
773773+ List.iter (fun child ->
774774+ if not !plaintext_found then begin
775775+ write_newline ();
776776+ let pt = to_writer_internal ~pretty ~indent_size ~indent:(indent + 1) ~text_mode:child_text_mode w child in
777777+ if pt then plaintext_found := true
778778+ end
779779+ ) node.children;
780780+ (* Only add closing tag if plaintext wasn't found *)
781781+ if not !plaintext_found then begin
300782 write_newline ();
301783 write_prefix ();
302784 write (serialize_end_tag name)
303303- end
785785+ end;
786786+ !plaintext_found
304787 end
305788 end
789789+790790+(* Public wrapper that discards the plaintext flag *)
791791+let to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node =
792792+ ignore (to_writer_internal ~pretty ~indent_size ~indent w node)
+3
lib/html5rw/parser/parser_tree_builder.ml
···208208 end
209209210210let insert_element t name ?(namespace=None) ?(push=false) attrs =
211211+ (* Reset ignore_lf flag - per HTML5 spec, only the immediately next token
212212+ after pre/textarea/listing should be checked for leading LF *)
213213+ t.ignore_lf <- false;
211214 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
212215 let node = Dom.create_element name ~namespace ~attrs ~location () in
213216 let (parent, before) = appropriate_insertion_place t in
+13-1
lib/html5rw/tokenizer/tokenizer_impl.ml
···711711 t.state <- Tokenizer_state.Bogus_comment
712712713713 and state_tag_name () =
714714- match Tokenizer_stream.consume t.stream with
714714+ match Tokenizer_stream.peek t.stream with
715715 | Some ('\t' | '\n' | '\x0C' | ' ') ->
716716+ Tokenizer_stream.advance t.stream;
716717 t.state <- Tokenizer_state.Before_attribute_name
717718 | Some '/' ->
719719+ Tokenizer_stream.advance t.stream;
718720 t.state <- Tokenizer_state.Self_closing_start_tag
719721 | Some '>' ->
722722+ Tokenizer_stream.advance t.stream;
720723 t.state <- Tokenizer_state.Data;
721724 emit_current_tag ()
722725 | Some '\x00' ->
726726+ Tokenizer_stream.advance t.stream;
723727 error t "unexpected-null-character";
724728 Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
729729+ | Some '<' ->
730730+ (* Per HTML5 spec section 13.2.5.8: '<' is "anything else" - append to tag name.
731731+ Note: The previous implementation incorrectly emitted the tag and switched
732732+ to tag open state. The spec says to just append the character to the tag name
733733+ without emitting an error. *)
734734+ Tokenizer_stream.advance t.stream;
735735+ Buffer.add_char t.current_tag_name '<'
725736 | Some c ->
737737+ Tokenizer_stream.advance t.stream;
726738 check_control_char c;
727739 Buffer.add_char t.current_tag_name (ascii_lower c)
728740 | None -> ()