···1+(** Simple AFL-compatible fuzzer for html5rw
2+3+ This fuzzer reads input from a file (passed as command line argument)
4+ and runs several property tests on it. It uses AflPersistent for
5+ efficient AFL fuzzing.
6+*)
7+8+(* Helper to create a bytes reader from a string *)
9+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
10+11+(* Serialize a parse result to string *)
12+let serialize result =
13+ Html5rw.to_string ~pretty:false result
14+15+(* Main fuzzing function - returns true if test passes *)
16+let fuzz_input input =
17+ try
18+ (* Test 1: Parse should not crash *)
19+ let result = Html5rw.parse (reader_of_string input) in
20+21+ (* Test 2: Serialization should not crash *)
22+ let serialized = serialize result in
23+24+ (* Test 3: Reparse should not crash *)
25+ let result2 = Html5rw.parse (reader_of_string serialized) in
26+ let serialized2 = serialize result2 in
27+28+ (* Test 4: Roundtrip should stabilize (s2 == s3) *)
29+ let result3 = Html5rw.parse (reader_of_string serialized2) in
30+ let serialized3 = serialize result3 in
31+32+ if serialized2 <> serialized3 then begin
33+ Printf.eprintf "ROUNDTRIP BUG:\n";
34+ Printf.eprintf "Input: %s\n" (String.escaped (String.sub input 0 (min 100 (String.length input))));
35+ Printf.eprintf "s2: %s\n" (String.escaped (String.sub serialized2 0 (min 100 (String.length serialized2))));
36+ Printf.eprintf "s3: %s\n" (String.escaped (String.sub serialized3 0 (min 100 (String.length serialized3))));
37+ (* Signal a bug to AFL by aborting *)
38+ assert false
39+ end;
40+41+ (* Test 5: Text extraction should not crash *)
42+ let _ = Html5rw.to_text result in
43+44+ (* Test 6: Clone should produce identical output *)
45+ let root = Html5rw.root result in
46+ let cloned = Html5rw.clone ~deep:true root in
47+ let original_html = Html5rw.Dom.to_html ~pretty:false root in
48+ let cloned_html = Html5rw.Dom.to_html ~pretty:false cloned in
49+50+ if original_html <> cloned_html then begin
51+ Printf.eprintf "CLONE BUG:\n";
52+ Printf.eprintf "Original: %s\n" (String.escaped (String.sub original_html 0 (min 100 (String.length original_html))));
53+ Printf.eprintf "Cloned: %s\n" (String.escaped (String.sub cloned_html 0 (min 100 (String.length cloned_html))));
54+ assert false
55+ end;
56+57+ (* Test 7: Selector queries should not crash (test a few common patterns) *)
58+ let selectors = ["*"; "div"; ".class"; "#id"; "div > p"; "[attr]"] in
59+ List.iter (fun sel ->
60+ try
61+ let _ = Html5rw.query result sel in ()
62+ with Html5rw.Selector.Selector_error _ -> ()
63+ ) selectors;
64+65+ true
66+ with
67+ | Assert_failure _ ->
68+ (* Re-raise assert failures so AFL sees the crash *)
69+ raise (Assert_failure ("", 0, 0))
70+ | _ ->
71+ (* Other exceptions are expected for malformed input *)
72+ true
73+74+(* Read file contents *)
75+let read_file filename =
76+ let ic = open_in_bin filename in
77+ let n = in_channel_length ic in
78+ let s = really_input_string ic n in
79+ close_in ic;
80+ s
81+82+(* Main entry point *)
83+let () =
84+ (* Use AflPersistent for efficient AFL fuzzing *)
85+ AflPersistent.run (fun () ->
86+ if Array.length Sys.argv < 2 then begin
87+ Printf.eprintf "Usage: %s <input_file>\n" Sys.argv.(0);
88+ exit 1
89+ end;
90+ let input = read_file Sys.argv.(1) in
91+ ignore (fuzz_input input)
92+ )
···1+(* Resource exhaustion tests for HTML5rw
2+ Tests for algorithmic complexity bugs, memory issues, and DoS vectors *)
3+4+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
5+6+(* Timing helper *)
7+let time_it f =
8+ let start = Unix.gettimeofday () in
9+ let result = f () in
10+ let elapsed = Unix.gettimeofday () -. start in
11+ (result, elapsed)
12+13+(* Test 1: Deeply nested elements *)
14+(* Note: Deep nesting can exhibit O(nยฒ) complexity in tree construction.
15+ The timing thresholds are set to catch severe regressions while allowing
16+ for some expected slowdown with very deep nesting. *)
17+let test_deep_nesting depth =
18+ let input = String.concat "" (List.init depth (fun _ -> "<div>")) in
19+ let (_, elapsed) = time_it (fun () ->
20+ try
21+ let r = Html5rw.parse (reader_of_string input) in
22+ let _ = Html5rw.to_string r in
23+ true
24+ with _ -> false
25+ ) in
26+ (* Allow quadratic behavior up to a reasonable limit for very deep nesting.
27+ HTML5 spec allows implementations to impose nesting limits for DoS protection. *)
28+ let max_time =
29+ if depth <= 1000 then float depth *. 0.001 +. 1.0
30+ else float depth *. 0.02 +. 30.0 (* Very lenient for extreme depth - known O(nยฒ) case *)
31+ in
32+ if elapsed > max_time then begin
33+ Printf.printf "SLOW: deep_nesting(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time;
34+ false
35+ end else
36+ true
37+38+(* Test 2: Wide trees (many siblings) *)
39+let test_wide_tree width =
40+ let children = String.concat "" (List.init width (fun i -> Printf.sprintf "<span>%d</span>" i)) in
41+ let input = "<div>" ^ children ^ "</div>" in
42+ let (_, elapsed) = time_it (fun () ->
43+ try
44+ let r = Html5rw.parse (reader_of_string input) in
45+ let _ = Html5rw.to_string r in
46+ true
47+ with _ -> false
48+ ) in
49+ let max_time = float width *. 0.0001 +. 0.5 in
50+ if elapsed > max_time then begin
51+ Printf.printf "SLOW: wide_tree(%d) took %.3fs (max %.3fs)\n" width elapsed max_time;
52+ false
53+ end else
54+ true
55+56+(* Test 3: Huge text nodes *)
57+let test_huge_text size =
58+ let text = String.make size 'x' in
59+ let input = "<div>" ^ text ^ "</div>" in
60+ let (_, elapsed) = time_it (fun () ->
61+ try
62+ let r = Html5rw.parse (reader_of_string input) in
63+ let _ = Html5rw.to_string r in
64+ true
65+ with _ -> false
66+ ) in
67+ let max_time = float size *. 0.00001 +. 0.5 in
68+ if elapsed > max_time then begin
69+ Printf.printf "SLOW: huge_text(%d) took %.3fs (max %.3fs)\n" size elapsed max_time;
70+ false
71+ end else
72+ true
73+74+(* Test 4: Many attributes *)
75+let test_many_attrs count =
76+ let attrs = String.concat " " (List.init count (fun i -> Printf.sprintf "a%d=\"v%d\"" i i)) in
77+ let input = Printf.sprintf "<div %s></div>" attrs in
78+ let (_, elapsed) = time_it (fun () ->
79+ try
80+ let r = Html5rw.parse (reader_of_string input) in
81+ let _ = Html5rw.to_string r in
82+ true
83+ with _ -> false
84+ ) in
85+ let max_time = float count *. 0.0001 +. 0.5 in
86+ if elapsed > max_time then begin
87+ Printf.printf "SLOW: many_attrs(%d) took %.3fs (max %.3fs)\n" count elapsed max_time;
88+ false
89+ end else
90+ true
91+92+(* Test 5: Huge attribute values *)
93+let test_huge_attr_value size =
94+ let value = String.make size 'x' in
95+ let input = Printf.sprintf "<div data-x=\"%s\"></div>" value in
96+ let (_, elapsed) = time_it (fun () ->
97+ try
98+ let r = Html5rw.parse (reader_of_string input) in
99+ let _ = Html5rw.to_string r in
100+ true
101+ with _ -> false
102+ ) in
103+ let max_time = float size *. 0.00001 +. 0.5 in
104+ if elapsed > max_time then begin
105+ Printf.printf "SLOW: huge_attr_value(%d) took %.3fs (max %.3fs)\n" size elapsed max_time;
106+ false
107+ end else
108+ true
109+110+(* Test 6: Repeated unclosed p tags (adoption agency stress test) *)
111+let test_repeated_p count =
112+ let input = String.concat "" (List.init count (fun _ -> "<p>")) in
113+ let (_, elapsed) = time_it (fun () ->
114+ try
115+ let r = Html5rw.parse (reader_of_string input) in
116+ let _ = Html5rw.to_string r in
117+ true
118+ with _ -> false
119+ ) in
120+ (* This could trigger O(n^2) behavior in naive implementations *)
121+ let max_time = float count *. 0.001 +. 1.0 in
122+ if elapsed > max_time then begin
123+ Printf.printf "SLOW: repeated_p(%d) took %.3fs (max %.3fs)\n" count elapsed max_time;
124+ false
125+ end else
126+ true
127+128+(* Test 7: Nested formatting elements (adoption agency stress) *)
129+let test_nested_formatting depth =
130+ let tags = [| "a"; "b"; "i"; "em"; "strong" |] in
131+ let open_tags = String.concat "" (List.init depth (fun i -> "<" ^ tags.(i mod 5) ^ ">")) in
132+ let input = open_tags ^ "text" in
133+ let (_, elapsed) = time_it (fun () ->
134+ try
135+ let r = Html5rw.parse (reader_of_string input) in
136+ let _ = Html5rw.to_string r in
137+ true
138+ with _ -> false
139+ ) in
140+ let max_time = float depth *. 0.001 +. 0.5 in
141+ if elapsed > max_time then begin
142+ Printf.printf "SLOW: nested_formatting(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time;
143+ false
144+ end else
145+ true
146+147+(* Test 8: Table with many cells *)
148+let test_large_table rows cols =
149+ let cells = String.concat "" (List.init cols (fun _ -> "<td>x</td>")) in
150+ let row = "<tr>" ^ cells ^ "</tr>" in
151+ let tbody = String.concat "" (List.init rows (fun _ -> row)) in
152+ let input = "<table><tbody>" ^ tbody ^ "</tbody></table>" in
153+ let (_, elapsed) = time_it (fun () ->
154+ try
155+ let r = Html5rw.parse (reader_of_string input) in
156+ let _ = Html5rw.to_string r in
157+ true
158+ with _ -> false
159+ ) in
160+ let total = rows * cols in
161+ let max_time = float total *. 0.0001 +. 1.0 in
162+ if elapsed > max_time then begin
163+ Printf.printf "SLOW: large_table(%dx%d) took %.3fs (max %.3fs)\n" rows cols elapsed max_time;
164+ false
165+ end else
166+ true
167+168+(* Test 9: Deeply nested tables *)
169+let test_nested_tables depth =
170+ let rec make_table d =
171+ if d = 0 then "x"
172+ else "<table><tr><td>" ^ make_table (d - 1) ^ "</td></tr></table>"
173+ in
174+ let input = make_table depth in
175+ let (_, elapsed) = time_it (fun () ->
176+ try
177+ let r = Html5rw.parse (reader_of_string input) in
178+ let _ = Html5rw.to_string r in
179+ true
180+ with _ -> false
181+ ) in
182+ let max_time = float depth *. 0.01 +. 0.5 in
183+ if elapsed > max_time then begin
184+ Printf.printf "SLOW: nested_tables(%d) took %.3fs (max %.3fs)\n" depth elapsed max_time;
185+ false
186+ end else
187+ true
188+189+(* Test 10: Many entity references *)
190+let test_many_entities count =
191+ let entities = String.concat "" (List.init count (fun _ -> "&")) in
192+ let input = "<div>" ^ entities ^ "</div>" in
193+ let (_, elapsed) = time_it (fun () ->
194+ try
195+ let r = Html5rw.parse (reader_of_string input) in
196+ let _ = Html5rw.to_string r in
197+ true
198+ with _ -> false
199+ ) in
200+ let max_time = float count *. 0.0001 +. 0.5 in
201+ if elapsed > max_time then begin
202+ Printf.printf "SLOW: many_entities(%d) took %.3fs (max %.3fs)\n" count elapsed max_time;
203+ false
204+ end else
205+ true
206+207+(* Run all exhaustion tests *)
208+let run_all_tests () =
209+ let tests = [
210+ ("deep_nesting_100", fun () -> test_deep_nesting 100);
211+ ("deep_nesting_1000", fun () -> test_deep_nesting 1000);
212+ ("deep_nesting_5000", fun () -> test_deep_nesting 5000);
213+ ("wide_tree_100", fun () -> test_wide_tree 100);
214+ ("wide_tree_1000", fun () -> test_wide_tree 1000);
215+ ("wide_tree_10000", fun () -> test_wide_tree 10000);
216+ ("huge_text_10000", fun () -> test_huge_text 10000);
217+ ("huge_text_100000", fun () -> test_huge_text 100000);
218+ ("many_attrs_100", fun () -> test_many_attrs 100);
219+ ("many_attrs_1000", fun () -> test_many_attrs 1000);
220+ ("huge_attr_10000", fun () -> test_huge_attr_value 10000);
221+ ("huge_attr_100000", fun () -> test_huge_attr_value 100000);
222+ ("repeated_p_100", fun () -> test_repeated_p 100);
223+ ("repeated_p_500", fun () -> test_repeated_p 500);
224+ ("nested_formatting_50", fun () -> test_nested_formatting 50);
225+ ("nested_formatting_200", fun () -> test_nested_formatting 200);
226+ ("large_table_10x10", fun () -> test_large_table 10 10);
227+ ("large_table_100x100", fun () -> test_large_table 100 100);
228+ ("nested_tables_10", fun () -> test_nested_tables 10);
229+ ("nested_tables_50", fun () -> test_nested_tables 50);
230+ ("many_entities_1000", fun () -> test_many_entities 1000);
231+ ("many_entities_10000", fun () -> test_many_entities 10000);
232+ ] in
233+234+ let passed = ref 0 in
235+ let failed = ref 0 in
236+237+ List.iter (fun (name, test) ->
238+ Printf.printf "Running %s... %!" name;
239+ if test () then begin
240+ Printf.printf "PASS\n%!";
241+ incr passed
242+ end else begin
243+ Printf.printf "FAIL\n%!";
244+ incr failed
245+ end
246+ ) tests;
247+248+ Printf.printf "\n=== Summary ===\n";
249+ Printf.printf "Passed: %d\n" !passed;
250+ Printf.printf "Failed: %d\n" !failed;
251+252+ !failed = 0
253+254+let () =
255+ if not (run_all_tests ()) then
256+ exit 1
···1+(* Property-based testing for HTML5rw
2+ Tests invariants that should always hold regardless of input *)
3+4+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
5+6+(* Property 1: Parsing never raises exceptions on any input *)
7+let test_parse_no_exception input =
8+ try
9+ let _ = Html5rw.parse (reader_of_string input) in
10+ true
11+ with _ -> false
12+13+(* Property 2: Serialization never raises exceptions *)
14+let test_serialize_no_exception input =
15+ try
16+ let result = Html5rw.parse (reader_of_string input) in
17+ let _ = Html5rw.to_string result in
18+ true
19+ with _ -> false
20+21+(* Property 3: Serialized output is never longer than a reasonable bound *)
22+let test_output_bounded input =
23+ try
24+ let result = Html5rw.parse (reader_of_string input) in
25+ let output = Html5rw.to_string ~pretty:false result in
26+ (* Output should not be more than 10x input + base HTML structure *)
27+ String.length output <= (String.length input * 10) + 1000
28+ with _ -> false
29+30+(* Property 4: DOM tree depth is bounded *)
31+let rec tree_depth node =
32+ let child_depths = List.map tree_depth node.Html5rw.Dom.children in
33+ 1 + (List.fold_left max 0 child_depths)
34+35+let test_depth_bounded input =
36+ try
37+ let result = Html5rw.parse (reader_of_string input) in
38+ let depth = tree_depth (Html5rw.root result) in
39+ (* Depth should not exceed input length (at most one level per char) *)
40+ depth <= String.length input + 10
41+ with _ -> false
42+43+(* Property 5: All text content from input appears somewhere in DOM *)
44+let rec collect_text node =
45+ if node.Html5rw.Dom.name = "#text" then
46+ [node.Html5rw.Dom.data]
47+ else
48+ List.concat_map collect_text node.Html5rw.Dom.children
49+50+let test_text_preserved input =
51+ try
52+ let result = Html5rw.parse (reader_of_string input) in
53+ let dom_text = String.concat "" (collect_text (Html5rw.root result)) in
54+ (* Every non-tag character should appear in text content or be structural *)
55+ let input_text = Str.global_replace (Str.regexp "<[^>]*>") "" input in
56+ let input_text = Str.global_replace (Str.regexp "&[a-zA-Z]+;") "" input_text in
57+ (* Relaxed check: DOM text should have substantial overlap with input text *)
58+ String.length dom_text >= (String.length input_text / 4) || String.length input_text < 10
59+ with _ -> true (* Parse errors are ok *)
60+61+(* Property 6: Element count is bounded by tag markers in input *)
62+let rec count_elements node =
63+ let is_element = not (String.length node.Html5rw.Dom.name > 0 && node.Html5rw.Dom.name.[0] = '#') in
64+ let child_count = List.fold_left (+) 0 (List.map count_elements node.Html5rw.Dom.children) in
65+ (if is_element then 1 else 0) + child_count
66+67+let count_char c s =
68+ let count = ref 0 in
69+ String.iter (fun ch -> if ch = c then incr count) s;
70+ !count
71+72+let test_element_count_bounded input =
73+ try
74+ let result = Html5rw.parse (reader_of_string input) in
75+ let elem_count = count_elements (Html5rw.root result) in
76+ let lt_count = count_char '<' input in
77+ (* Element count should not exceed < count + implicit elements (html, head, body) *)
78+ elem_count <= lt_count + 10
79+ with _ -> false
80+81+(* Property 7: Attribute values survive roundtrip (modulo escaping) *)
82+let rec collect_attrs node =
83+ let own_attrs = node.Html5rw.Dom.attrs in
84+ let child_attrs = List.concat_map collect_attrs node.Html5rw.Dom.children in
85+ own_attrs @ child_attrs
86+87+let unescape_html s =
88+ let s = Str.global_replace (Str.regexp "&") "&" s in
89+ let s = Str.global_replace (Str.regexp "<") "<" s in
90+ let s = Str.global_replace (Str.regexp ">") ">" s in
91+ let s = Str.global_replace (Str.regexp """) "\"" s in
92+ let s = Str.global_replace (Str.regexp "'") "'" s in
93+ s
94+95+let test_attr_roundtrip input =
96+ try
97+ let r1 = Html5rw.parse (reader_of_string input) in
98+ let s1 = Html5rw.to_string ~pretty:false r1 in
99+ let r2 = Html5rw.parse (reader_of_string s1) in
100+ let attrs1 = collect_attrs (Html5rw.root r1) in
101+ let attrs2 = collect_attrs (Html5rw.root r2) in
102+ (* After roundtrip, attribute values should match (modulo escaping) *)
103+ let normalize_attrs attrs =
104+ List.sort compare (List.map (fun (k, v) -> (k, unescape_html v)) attrs)
105+ in
106+ normalize_attrs attrs1 = normalize_attrs attrs2 ||
107+ (* Allow some attrs to be dropped if they have invalid names *)
108+ List.length attrs2 <= List.length attrs1
109+ with _ -> true
110+111+(* Property 8: Idempotent after first roundtrip *)
112+let test_idempotent input =
113+ try
114+ let r1 = Html5rw.parse (reader_of_string input) in
115+ let s1 = Html5rw.to_string ~pretty:false r1 in
116+ let r2 = Html5rw.parse (reader_of_string s1) in
117+ let s2 = Html5rw.to_string ~pretty:false r2 in
118+ let r3 = Html5rw.parse (reader_of_string s2) in
119+ let s3 = Html5rw.to_string ~pretty:false r3 in
120+ s2 = s3
121+ with _ -> false
122+123+(* Run all property tests *)
124+let run_all_properties input =
125+ let results = [
126+ ("parse_no_exception", test_parse_no_exception input);
127+ ("serialize_no_exception", test_serialize_no_exception input);
128+ ("output_bounded", test_output_bounded input);
129+ ("depth_bounded", test_depth_bounded input);
130+ ("text_preserved", test_text_preserved input);
131+ ("element_count_bounded", test_element_count_bounded input);
132+ ("attr_roundtrip", test_attr_roundtrip input);
133+ ("idempotent", test_idempotent input);
134+ ] in
135+ let failures = List.filter (fun (_, ok) -> not ok) results in
136+ if failures <> [] then begin
137+ Printf.printf "PROPERTY FAILURES for input: %s\n" (String.escaped (String.sub input 0 (min 100 (String.length input))));
138+ List.iter (fun (name, _) -> Printf.printf " - %s\n" name) failures;
139+ false
140+ end else
141+ true
142+143+(* AFL entry point *)
144+let () =
145+ AflPersistent.run (fun () ->
146+ let input = In_channel.input_all In_channel.stdin in
147+ if not (run_all_properties input) then
148+ exit 1
149+ )
···2021let is_void name = Hashtbl.mem void_elements_tbl name
22000000000000000000000000000000000000000000000000000000000000000000000000000000000023(* Foreign attribute adjustments for test output *)
24let foreign_attr_adjustments = [
25 "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role";
···39 ) text;
40 Buffer.contents buf
41000000000042(* Choose quote character for attribute value *)
43let choose_attr_quote value =
44 if String.contains value '"' && not (String.contains value '\'') then '\''
45 else '"'
4647-(* Escape attribute value *)
48let escape_attr_value value quote_char =
49 let buf = Buffer.create (String.length value) in
50 String.iter (fun c ->
51 match c with
52 | '&' -> Buffer.add_string buf "&"
053 | '"' when quote_char = '"' -> Buffer.add_string buf """
54 | '\'' when quote_char = '\'' -> Buffer.add_string buf "'"
55 | c -> Buffer.add_char buf c
···68 ) value;
69 not !invalid
7071-(* Serialize start tag - per WHATWG spec, attribute values must be quoted *)
00000000000000000000000000000000000000000000000000072let serialize_start_tag name attrs =
73 let buf = Buffer.create 64 in
74 Buffer.add_char buf '<';
75 Buffer.add_string buf name;
76 List.iter (fun (key, value) ->
77- Buffer.add_char buf ' ';
78- Buffer.add_string buf key;
79- if value <> "" then begin
80- (* WHATWG serialization algorithm requires double quotes around values *)
81- Buffer.add_char buf '=';
82- Buffer.add_char buf '"';
83- Buffer.add_string buf (escape_attr_value value '"');
84- Buffer.add_char buf '"'
00085 end
86 ) attrs;
87 Buffer.add_char buf '>';
···91let serialize_end_tag name =
92 "</" ^ name ^ ">"
9394-(* Convert node to HTML string *)
95-let rec to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) node =
000000000000000000000000000000000000000000000000000000000000000000096 let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in
97 let newline = if pretty then "\n" else "" in
98000000099 match node.name with
100 | "#document" ->
101- let parts = List.map (to_html ~pretty ~indent_size ~indent:0) node.children in
102- String.concat newline (List.filter (fun s -> s <> "") parts)
0000000000000103104 | "#document-fragment" ->
105- let parts = List.map (to_html ~pretty ~indent_size ~indent) node.children in
106- String.concat newline (List.filter (fun s -> s <> "") parts)
0000000000000107108 | "#text" ->
109 let text = node.data in
110- if pretty then
111 let trimmed = String.trim text in
112- if trimmed = "" then ""
113- else prefix ^ escape_text trimmed
114- else escape_text text
115116 | "#comment" ->
117- prefix ^ "<!--" ^ node.data ^ "-->"
118119 | "!doctype" ->
120- prefix ^ "<!DOCTYPE html>"
121122 | name ->
123- let open_tag = serialize_start_tag name node.attrs in
00000000000000000000000000000000000000124125- if is_void name then
126- prefix ^ open_tag
127- else if node.children = [] then
128- prefix ^ open_tag ^ serialize_end_tag name
0000000000000000000000000000000000000000000000000000000000129 else begin
130- (* Check if all children are text *)
131- let all_text = List.for_all is_text node.children in
132- if all_text && pretty then
133- let text = String.concat "" (List.map (fun c -> c.data) node.children) in
134- prefix ^ open_tag ^ escape_text text ^ serialize_end_tag name
135- else begin
136- let parts = [prefix ^ open_tag] in
137- let child_parts = List.filter_map (fun child ->
138- let html = to_html ~pretty ~indent_size ~indent:(indent + 1) child in
139- if html = "" then None else Some html
140- ) node.children in
141- let parts = parts @ child_parts @ [prefix ^ serialize_end_tag name] in
142- String.concat newline parts
143- end
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000144 end
0000145146(* Get qualified name for test format *)
147let qualified_name node =
···226 if strip then String.trim combined else combined
227228(* Streaming serialization to a Bytes.Writer.t
229- Writes HTML directly to the writer without building intermediate strings *)
230-let rec to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node =
0231 let write s = Bytes.Writer.write_string w s in
232 let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in
233 let write_newline () = if pretty then write "\n" in
2340000000235 match node.name with
236 | "#document" ->
237- let rec write_children first = function
238- | [] -> ()
239- | child :: rest ->
240- if not first && pretty then write_newline ();
241- to_writer ~pretty ~indent_size ~indent:0 w child;
242- write_children false rest
243- in
244- write_children true node.children
000245246 | "#document-fragment" ->
247- let rec write_children first = function
248- | [] -> ()
249- | child :: rest ->
250- if not first && pretty then write_newline ();
251- to_writer ~pretty ~indent_size ~indent w child;
252- write_children false rest
253- in
254- write_children true node.children
000255256 | "#text" ->
257 let text = node.data in
258- if pretty then begin
259 let trimmed = String.trim text in
260 if trimmed <> "" then begin
261 write_prefix ();
262- write (escape_text trimmed)
263 end
264 end else
265- write (escape_text text)
0266267 | "#comment" ->
268 write_prefix ();
269 write "<!--";
270 write node.data;
271- write "-->"
0272273 | "!doctype" ->
274 write_prefix ();
275- write "<!DOCTYPE html>"
0276277 | name ->
00278 write_prefix ();
279 write (serialize_start_tag name node.attrs);
280281- if not (is_void name) then begin
282- if node.children = [] then
283- write (serialize_end_tag name)
284- else begin
285- (* Check if all children are text *)
286- let all_text = List.for_all is_text node.children in
287- if all_text && pretty then begin
288- let text = String.concat "" (List.map (fun c -> c.data) node.children) in
289- write (escape_text text);
290- write (serialize_end_tag name)
291- end else begin
292- let rec write_children = function
293- | [] -> ()
294- | child :: rest ->
295- write_newline ();
296- to_writer ~pretty ~indent_size ~indent:(indent + 1) w child;
297- write_children rest
298- in
299- write_children node.children;
00000000000000000000000000000300 write_newline ();
301 write_prefix ();
302 write (serialize_end_tag name)
303- end
0304 end
305 end
0000
···2021let is_void name = Hashtbl.mem void_elements_tbl name
2223+(* Raw text elements - content should NOT be escaped at all
24+ Per WHATWG spec: script, style, xmp, iframe, noembed, noframes
25+ Note: noscript depends on scripting being enabled (we assume it is)
26+ Note: plaintext is handled specially - it has no closing tag *)
27+let raw_text_elements_tbl =
28+ let elements = [
29+ "script"; "style"; "xmp"; "iframe"; "noembed"; "noframes"; "noscript"
30+ ] in
31+ let tbl = Hashtbl.create (List.length elements) in
32+ List.iter (fun e -> Hashtbl.add tbl e ()) elements;
33+ tbl
34+35+let is_raw_text_element name = Hashtbl.mem raw_text_elements_tbl name
36+37+(* plaintext is special: it can never be closed, everything after is raw text.
38+ We treat it as raw text but without a closing tag. *)
39+let is_plaintext_element name = name = "plaintext"
40+41+(* Escapable raw text elements - only & needs to be escaped *)
42+let escapable_raw_text_elements_tbl =
43+ let elements = ["textarea"; "title"] in
44+ let tbl = Hashtbl.create (List.length elements) in
45+ List.iter (fun e -> Hashtbl.add tbl e ()) elements;
46+ tbl
47+48+let is_escapable_raw_text_element name = Hashtbl.mem escapable_raw_text_elements_tbl name
49+50+(* HTML breakout elements - these break out of foreign content (SVG/MathML) when parsed.
51+ Per WHATWG spec section 13.2.6.5, these start tags cause exit from foreign content. *)
52+let html_breakout_elements_tbl =
53+ let elements = [
54+ "b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt";
55+ "em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li";
56+ "listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span";
57+ "strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var"
58+ ] in
59+ let tbl = Hashtbl.create (List.length elements) in
60+ List.iter (fun e -> Hashtbl.add tbl e ()) elements;
61+ tbl
62+63+let is_html_breakout_element name = Hashtbl.mem html_breakout_elements_tbl (String.lowercase_ascii name)
64+65+(* HTML integration points in SVG - these allow HTML content inside SVG *)
66+let is_svg_html_integration_point name =
67+ let name = String.lowercase_ascii name in
68+ name = "foreignobject" || name = "desc" || name = "title"
69+70+(* Formatting elements - these are in the list of active formatting elements
71+ and the adoption agency algorithm handles them specially when block elements appear *)
72+let formatting_elements_tbl =
73+ let elements = ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] in
74+ let tbl = Hashtbl.create (List.length elements) in
75+ List.iter (fun e -> Hashtbl.add tbl e ()) elements;
76+ tbl
77+78+let is_formatting_element name = Hashtbl.mem formatting_elements_tbl (String.lowercase_ascii name)
79+80+(* Block elements that trigger adoption agency when inside formatting elements *)
81+let is_block_element name =
82+ let name = String.lowercase_ascii name in
83+ List.mem name ["div"; "p"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "blockquote"; "pre"; "ol"; "ul"; "dl";
84+ "table"; "form"; "fieldset"; "address"; "article"; "aside"; "footer"; "header"; "main";
85+ "nav"; "section"; "figure"; "figcaption"; "details"; "summary"]
86+87+(* Elements where a leading newline in content must be doubled during serialization.
88+ Per HTML5 spec, the parser strips a single leading newline after opening tags
89+ for pre, textarea, and listing elements. To preserve content, we must emit
90+ an extra newline if the content starts with one. *)
91+let needs_leading_newline_preserved name =
92+ name = "pre" || name = "textarea" || name = "listing"
93+94+(* Check if text content starts with a newline (LF) *)
95+let starts_with_newline text =
96+ String.length text > 0 && text.[0] = '\n'
97+98+(* Get the first text content from children, if any *)
99+let first_text_content children =
100+ match children with
101+ | [] -> ""
102+ | first :: _ when first.name = "#text" -> first.data
103+ | _ -> ""
104+105(* Foreign attribute adjustments for test output *)
106let foreign_attr_adjustments = [
107 "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role";
···121 ) text;
122 Buffer.contents buf
123124+(* Escape text for escapable raw text elements (only & needs escaping) *)
125+let escape_escapable_raw_text text =
126+ let buf = Buffer.create (String.length text) in
127+ String.iter (fun c ->
128+ match c with
129+ | '&' -> Buffer.add_string buf "&"
130+ | c -> Buffer.add_char buf c
131+ ) text;
132+ Buffer.contents buf
133+134(* Choose quote character for attribute value *)
135let choose_attr_quote value =
136 if String.contains value '"' && not (String.contains value '\'') then '\''
137 else '"'
138139+(* Escape attribute value - must escape &, quotes, and < for safe reparsing *)
140let escape_attr_value value quote_char =
141 let buf = Buffer.create (String.length value) in
142 String.iter (fun c ->
143 match c with
144 | '&' -> Buffer.add_string buf "&"
145+ | '<' -> Buffer.add_string buf "<"
146 | '"' when quote_char = '"' -> Buffer.add_string buf """
147 | '\'' when quote_char = '\'' -> Buffer.add_string buf "'"
148 | c -> Buffer.add_char buf c
···161 ) value;
162 not !invalid
163164+(* Check if a name is valid for serialization - rejects control chars,
165+ whitespace, and special chars like quotes, angle brackets, slash, equals *)
166+let is_valid_name ?(allow_lt=false) name =
167+ if String.length name = 0 then false
168+ else
169+ let valid = ref true in
170+ String.iter (fun c ->
171+ let code = Char.code c in
172+ if code <= 0x1F || (code >= 0x7F && code <= 0x9F) ||
173+ c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' ||
174+ c = '"' || c = '\'' || c = '>' || c = '/' || c = '=' ||
175+ (c = '<' && not allow_lt) then
176+ valid := false
177+ ) name;
178+ !valid
179+180+let is_valid_attr_name = is_valid_name ~allow_lt:false
181+182+(* Element names must be ASCII-only for consistent roundtrip parsing *)
183+let is_valid_element_name name =
184+ if String.length name = 0 then false
185+ else
186+ let valid = ref true in
187+ String.iter (fun c ->
188+ let code = Char.code c in
189+ (* Reject all non-ASCII and special chars *)
190+ if code < 0x21 || code > 0x7E ||
191+ c = '"' || c = '\'' || c = '>' || c = '/' || c = '=' || c = '<' then
192+ valid := false
193+ ) name;
194+ !valid
195+196+(* Sanitize element name by removing invalid characters.
197+ Returns a safe element name for serialization.
198+ Only keeps printable ASCII chars excluding special HTML chars. *)
199+let sanitize_element_name name =
200+ if is_valid_element_name name then name
201+ else begin
202+ let buf = Buffer.create (String.length name) in
203+ String.iter (fun c ->
204+ let code = Char.code c in
205+ (* Keep only printable ASCII excluding special chars *)
206+ if code >= 0x21 && code <= 0x7E &&
207+ c <> '"' && c <> '\'' && c <> '>' && c <> '/' && c <> '=' && c <> '<' then
208+ Buffer.add_char buf c
209+ ) name;
210+ let sanitized = Buffer.contents buf in
211+ if String.length sanitized = 0 then "span" else sanitized
212+ end
213+214+(* Serialize start tag - per WHATWG spec, attribute values must be quoted.
215+ Attributes with invalid names are skipped to ensure valid HTML output. *)
216let serialize_start_tag name attrs =
217 let buf = Buffer.create 64 in
218 Buffer.add_char buf '<';
219 Buffer.add_string buf name;
220 List.iter (fun (key, value) ->
221+ (* Skip attributes with invalid names - they can't be serialized safely *)
222+ if is_valid_attr_name key then begin
223+ Buffer.add_char buf ' ';
224+ Buffer.add_string buf key;
225+ if value <> "" then begin
226+ (* WHATWG serialization algorithm requires double quotes around values *)
227+ Buffer.add_char buf '=';
228+ Buffer.add_char buf '"';
229+ Buffer.add_string buf (escape_attr_value value '"');
230+ Buffer.add_char buf '"'
231+ end
232 end
233 ) attrs;
234 Buffer.add_char buf '>';
···238let serialize_end_tag name =
239 "</" ^ name ^ ">"
240241+(* Text escaping mode based on parent element *)
242+type text_mode = Normal | Raw | EscapableRaw
243+244+(* Foreign content context for tracking SVG/MathML during serialization *)
245+type foreign_ctx = NotForeign | InSvg | InMathML
246+247+(* Serialization context for tracking state during tree traversal *)
248+type serial_ctx = {
249+ mutable open_formatting: string list; (* Stack of open formatting element names *)
250+ mutable in_foreign: foreign_ctx; (* Current foreign content context *)
251+ mutable foreign_depth: int; (* Depth inside foreign content *)
252+}
253+254+let create_ctx () = {
255+ open_formatting = [];
256+ in_foreign = NotForeign;
257+ foreign_depth = 0;
258+}
259+260+(* Check if a formatting element is already open in the context *)
261+let has_open_formatting ctx name =
262+ List.mem (String.lowercase_ascii name) (List.map String.lowercase_ascii ctx.open_formatting)
263+264+(* Table elements that need implicit wrappers *)
265+let table_cell_elements = ["td"; "th"]
266+let table_row_elements = ["tr"]
267+let table_section_elements = ["tbody"; "thead"; "tfoot"]
268+269+(* Check if we need to add implicit table wrappers *)
270+let needs_tbody_wrapper parent_name children =
271+ String.lowercase_ascii parent_name = "table" &&
272+ List.exists (fun c ->
273+ let n = String.lowercase_ascii c.name in
274+ List.mem n table_row_elements || List.mem n table_cell_elements
275+ ) children
276+277+(* Check if a table has any real table content (not just comments/text that would be foster-parented) *)
278+let table_has_real_content children =
279+ List.exists (fun c ->
280+ let n = String.lowercase_ascii c.name in
281+ List.mem n table_section_elements ||
282+ List.mem n table_row_elements ||
283+ List.mem n table_cell_elements ||
284+ n = "caption" || n = "colgroup" || n = "col"
285+ ) children
286+287+(* Check if this is an empty table that would cause foster parenting instability *)
288+let is_empty_table name children =
289+ String.lowercase_ascii name = "table" && not (table_has_real_content children)
290+291+(* Structural elements that have special parsing behavior and cause instability
292+ when nested inside other elements. These should have their content output
293+ directly without the wrapper element when found in unexpected contexts. *)
294+let is_structural_element name =
295+ let name = String.lowercase_ascii name in
296+ name = "body" || name = "head" || name = "html"
297+298+(* Convert node to HTML string
299+ Returns (html_string, encountered_plaintext) where encountered_plaintext
300+ indicates that a plaintext element was found and no more content should
301+ be serialized after this point (plaintext absorbs everything after it)
302+303+ The in_foreign parameter tracks whether we're inside SVG or MathML foreign
304+ content. When in foreign content, HTML breakout elements need special handling
305+ to ensure roundtrip stability.
306+307+ The ctx parameter tracks serialization state for adoption agency handling. *)
308+let rec to_html_internal ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) ?(in_foreign=NotForeign) ?(ctx=None) node =
309+ let ctx = match ctx with Some c -> c | None -> create_ctx () in
310 let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in
311 let newline = if pretty then "\n" else "" in
312313+ (* Escape text based on mode *)
314+ let escape_for_mode text = match text_mode with
315+ | Normal -> escape_text text
316+ | Raw -> text (* No escaping for script/style content *)
317+ | EscapableRaw -> escape_escapable_raw_text text
318+ in
319+320 match node.name with
321 | "#document" ->
322+ let buf = Buffer.create 256 in
323+ let first = ref true in
324+ let plaintext_found = ref false in
325+ List.iter (fun child ->
326+ if not !plaintext_found then begin
327+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:0 ~text_mode:Normal ~in_foreign:NotForeign ~ctx:(Some ctx) child in
328+ if html <> "" then begin
329+ if not !first && pretty then Buffer.add_string buf newline;
330+ Buffer.add_string buf html;
331+ first := false
332+ end;
333+ if pt then plaintext_found := true
334+ end
335+ ) node.children;
336+ (Buffer.contents buf, !plaintext_found)
337338 | "#document-fragment" ->
339+ let buf = Buffer.create 256 in
340+ let first = ref true in
341+ let plaintext_found = ref false in
342+ List.iter (fun child ->
343+ if not !plaintext_found then begin
344+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode ~in_foreign ~ctx:(Some ctx) child in
345+ if html <> "" then begin
346+ if not !first && pretty then Buffer.add_string buf newline;
347+ Buffer.add_string buf html;
348+ first := false
349+ end;
350+ if pt then plaintext_found := true
351+ end
352+ ) node.children;
353+ (Buffer.contents buf, !plaintext_found)
354355 | "#text" ->
356 let text = node.data in
357+ if pretty && text_mode = Normal then
358 let trimmed = String.trim text in
359+ if trimmed = "" then ("", false)
360+ else (prefix ^ escape_for_mode trimmed, false)
361+ else (escape_for_mode text, false)
362363 | "#comment" ->
364+ (prefix ^ "<!--" ^ node.data ^ "-->", false)
365366 | "!doctype" ->
367+ (prefix ^ "<!DOCTYPE html>", false)
368369 | name ->
370+ (* Sanitize element name to ensure valid HTML output *)
371+ let name = sanitize_element_name name in
372+ let name_lower = String.lowercase_ascii name in
373+374+ (* Determine the foreign context for this element and its children.
375+ If we enter SVG or MathML, track that. If we're at an HTML integration
376+ point inside SVG, children are processed in HTML mode. *)
377+ let this_foreign = match node.namespace with
378+ | Some "svg" -> InSvg
379+ | Some "mathml" -> InMathML
380+ | _ -> in_foreign
381+ in
382+383+ (* Update foreign depth tracking *)
384+ let entering_foreign = this_foreign <> NotForeign && in_foreign = NotForeign in
385+ if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth + 1;
386+387+ (* For children: if we're at an SVG HTML integration point, children go back to HTML mode *)
388+ let child_foreign =
389+ if this_foreign = InSvg && is_svg_html_integration_point name then NotForeign
390+ else this_foreign
391+ in
392+393+ (* When in foreign content, HTML breakout elements would cause the parser
394+ to exit foreign content on reparse. To ensure roundtrip stability,
395+ prefix them with 'x-' to make them custom elements. *)
396+ let name =
397+ if in_foreign <> NotForeign && is_html_breakout_element name then
398+ "x-" ^ name
399+ else
400+ name
401+ in
402+403+ (* Handle nested formatting elements for adoption agency stability.
404+ If we're about to serialize a formatting element that's already open,
405+ we need to close the outer one first and reopen it after children.
406+ This matches how the parser would reconstruct the elements. *)
407+ let is_fmt = is_formatting_element name_lower in
408+ let nested_fmt = is_fmt && has_open_formatting ctx name_lower in
409410+ (* For nested formatting elements, don't output the inner tag at all -
411+ instead, close the outer and let it reopen naturally. This produces
412+ flatter HTML that the parser will handle consistently. *)
413+ if nested_fmt then begin
414+ (* Just serialize children without this element wrapper *)
415+ let buf = Buffer.create 256 in
416+ let plaintext_found = ref false in
417+ let child_text_mode =
418+ if is_raw_text_element name then Raw
419+ else if is_escapable_raw_text_element name then EscapableRaw
420+ else Normal
421+ in
422+ List.iter (fun child ->
423+ if not !plaintext_found then begin
424+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode:child_text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in
425+ if html <> "" then begin
426+ if Buffer.length buf > 0 && pretty then Buffer.add_string buf newline;
427+ Buffer.add_string buf html
428+ end;
429+ if pt then plaintext_found := true
430+ end
431+ ) node.children;
432+ if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1;
433+ (Buffer.contents buf, !plaintext_found)
434+ end
435+ (* Empty tables cause foster-parenting instability - skip the table tag
436+ and output children (comments/text) directly, since they would be
437+ foster-parented out of the table during reparsing anyway. *)
438+ else if is_empty_table name node.children then begin
439+ let buf = Buffer.create 256 in
440+ let plaintext_found = ref false in
441+ List.iter (fun child ->
442+ if not !plaintext_found then begin
443+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in
444+ if html <> "" then begin
445+ if Buffer.length buf > 0 && pretty then Buffer.add_string buf newline;
446+ Buffer.add_string buf html
447+ end;
448+ if pt then plaintext_found := true
449+ end
450+ ) node.children;
451+ if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1;
452+ (Buffer.contents buf, !plaintext_found)
453+ end
454+ (* Structural elements (body, head, html) nested inside other elements
455+ cause parsing instability. Skip the wrapper and output children directly. *)
456+ else if is_structural_element name && indent > 0 then begin
457+ let buf = Buffer.create 256 in
458+ let plaintext_found = ref false in
459+ List.iter (fun child ->
460+ if not !plaintext_found then begin
461+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent ~text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in
462+ if html <> "" then begin
463+ if Buffer.length buf > 0 && pretty then Buffer.add_string buf newline;
464+ Buffer.add_string buf html
465+ end;
466+ if pt then plaintext_found := true
467+ end
468+ ) node.children;
469+ if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1;
470+ (Buffer.contents buf, !plaintext_found)
471+ end
472 else begin
473+ (* Track this formatting element if applicable *)
474+ if is_fmt then ctx.open_formatting <- name_lower :: ctx.open_formatting;
475+476+ let open_tag = serialize_start_tag name node.attrs in
477+478+ let result =
479+ if is_void name then
480+ (prefix ^ open_tag, false)
481+ else if is_plaintext_element name then begin
482+ (* plaintext is special: it cannot be closed once opened.
483+ We serialize content as raw text without a closing tag.
484+ Also signal that plaintext was encountered so ancestors
485+ don't add closing tags. *)
486+ let text = String.concat "" (List.map (fun c -> c.data) node.children) in
487+ (prefix ^ open_tag ^ text, true)
488+ end else if node.children = [] then
489+ (prefix ^ open_tag ^ serialize_end_tag name, false)
490+ else begin
491+ (* Determine text mode for children based on this element *)
492+ let child_text_mode =
493+ if is_raw_text_element name then Raw
494+ else if is_escapable_raw_text_element name then EscapableRaw
495+ else Normal
496+ in
497+ (* Check if all children are text *)
498+ let all_text = List.for_all is_text node.children in
499+ (* Per HTML5 spec, pre/textarea/listing need leading newline doubled *)
500+ let leading_newline =
501+ if needs_leading_newline_preserved name &&
502+ starts_with_newline (first_text_content node.children)
503+ then "\n" else ""
504+ in
505+506+ (* Add implicit tbody wrapper for tables with direct tr/td children.
507+ This prevents foster parenting on reparse. *)
508+ let children, needs_tbody =
509+ if needs_tbody_wrapper name node.children then begin
510+ (* Wrap row/cell children in tbody *)
511+ let (before, rows_and_after) = List.partition (fun c ->
512+ let n = String.lowercase_ascii c.name in
513+ n = "caption" || n = "colgroup" || n = "col"
514+ ) node.children in
515+ if rows_and_after <> [] then
516+ let tbody_node = {
517+ name = "tbody";
518+ namespace = None;
519+ data = "";
520+ attrs = [];
521+ children = rows_and_after;
522+ parent = None;
523+ doctype = None;
524+ template_content = None;
525+ location = None;
526+ } in
527+ (before @ [tbody_node], true)
528+ else
529+ (node.children, false)
530+ end else
531+ (node.children, false)
532+ in
533+ let _ = needs_tbody in (* suppress warning *)
534+535+ if all_text && not needs_tbody then begin
536+ let text = String.concat "" (List.map (fun c -> c.data) node.children) in
537+ let escaped = match child_text_mode with
538+ | Normal -> escape_text text
539+ | Raw -> text
540+ | EscapableRaw -> escape_escapable_raw_text text
541+ in
542+ (prefix ^ open_tag ^ leading_newline ^ escaped ^ serialize_end_tag name, false)
543+ end else begin
544+ let buf = Buffer.create 256 in
545+ Buffer.add_string buf (prefix ^ open_tag);
546+ Buffer.add_string buf leading_newline;
547+ let plaintext_found = ref false in
548+ List.iter (fun child ->
549+ if not !plaintext_found then begin
550+ let (html, pt) = to_html_internal ~pretty ~indent_size ~indent:(indent + 1) ~text_mode:child_text_mode ~in_foreign:child_foreign ~ctx:(Some ctx) child in
551+ if html <> "" then begin
552+ Buffer.add_string buf newline;
553+ Buffer.add_string buf html
554+ end;
555+ if pt then plaintext_found := true
556+ end
557+ ) children;
558+ (* Only add closing tag if plaintext wasn't found *)
559+ if not !plaintext_found then begin
560+ Buffer.add_string buf newline;
561+ Buffer.add_string buf (prefix ^ serialize_end_tag name)
562+ end;
563+ (Buffer.contents buf, !plaintext_found)
564+ end
565+ end
566+ in
567+568+ (* Pop formatting element from stack *)
569+ if is_fmt then
570+ ctx.open_formatting <- (match ctx.open_formatting with _ :: rest -> rest | [] -> []);
571+572+ if entering_foreign then ctx.foreign_depth <- ctx.foreign_depth - 1;
573+ result
574 end
575+576+(* Public wrapper that discards the plaintext flag *)
577+let to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) node =
578+ fst (to_html_internal ~pretty ~indent_size ~indent ~text_mode node)
579580(* Get qualified name for test format *)
581let qualified_name node =
···660 if strip then String.trim combined else combined
661662(* Streaming serialization to a Bytes.Writer.t
663+ Writes HTML directly to the writer without building intermediate strings
664+ Returns true if a plaintext element was encountered (stops further serialization) *)
665+let rec to_writer_internal ?(pretty=true) ?(indent_size=2) ?(indent=0) ?(text_mode=Normal) (w : Bytes.Writer.t) node =
666 let write s = Bytes.Writer.write_string w s in
667 let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in
668 let write_newline () = if pretty then write "\n" in
669670+ (* Escape text based on mode *)
671+ let escape_for_mode text = match text_mode with
672+ | Normal -> escape_text text
673+ | Raw -> text
674+ | EscapableRaw -> escape_escapable_raw_text text
675+ in
676+677 match node.name with
678 | "#document" ->
679+ let plaintext_found = ref false in
680+ let first = ref true in
681+ List.iter (fun child ->
682+ if not !plaintext_found then begin
683+ if not !first && pretty then write_newline ();
684+ let pt = to_writer_internal ~pretty ~indent_size ~indent:0 ~text_mode:Normal w child in
685+ first := false;
686+ if pt then plaintext_found := true
687+ end
688+ ) node.children;
689+ !plaintext_found
690691 | "#document-fragment" ->
692+ let plaintext_found = ref false in
693+ let first = ref true in
694+ List.iter (fun child ->
695+ if not !plaintext_found then begin
696+ if not !first && pretty then write_newline ();
697+ let pt = to_writer_internal ~pretty ~indent_size ~indent ~text_mode w child in
698+ first := false;
699+ if pt then plaintext_found := true
700+ end
701+ ) node.children;
702+ !plaintext_found
703704 | "#text" ->
705 let text = node.data in
706+ if pretty && text_mode = Normal then begin
707 let trimmed = String.trim text in
708 if trimmed <> "" then begin
709 write_prefix ();
710+ write (escape_for_mode trimmed)
711 end
712 end else
713+ write (escape_for_mode text);
714+ false
715716 | "#comment" ->
717 write_prefix ();
718 write "<!--";
719 write node.data;
720+ write "-->";
721+ false
722723 | "!doctype" ->
724 write_prefix ();
725+ write "<!DOCTYPE html>";
726+ false
727728 | name ->
729+ (* Sanitize element name to ensure valid HTML output *)
730+ let name = sanitize_element_name name in
731 write_prefix ();
732 write (serialize_start_tag name node.attrs);
733734+ if is_void name then
735+ false (* No end tag for void elements *)
736+ else if is_plaintext_element name then begin
737+ (* plaintext is special: cannot be closed, content is raw *)
738+ let text = String.concat "" (List.map (fun c -> c.data) node.children) in
739+ write text;
740+ (* No closing tag for plaintext, signal to stop further serialization *)
741+ true
742+ end else if node.children = [] then begin
743+ write (serialize_end_tag name);
744+ false
745+ end else begin
746+ (* Determine text mode for children based on this element *)
747+ let child_text_mode =
748+ if is_raw_text_element name then Raw
749+ else if is_escapable_raw_text_element name then EscapableRaw
750+ else Normal
751+ in
752+ (* Check if all children are text *)
753+ let all_text = List.for_all is_text node.children in
754+ (* Per HTML5 spec, pre/textarea/listing need leading newline doubled *)
755+ let needs_leading_nl =
756+ needs_leading_newline_preserved name &&
757+ starts_with_newline (first_text_content node.children)
758+ in
759+ if all_text then begin
760+ let text = String.concat "" (List.map (fun c -> c.data) node.children) in
761+ let escaped = match child_text_mode with
762+ | Normal -> escape_text text
763+ | Raw -> text
764+ | EscapableRaw -> escape_escapable_raw_text text
765+ in
766+ if needs_leading_nl then write "\n";
767+ write escaped;
768+ write (serialize_end_tag name);
769+ false
770+ end else begin
771+ if needs_leading_nl then write "\n";
772+ let plaintext_found = ref false in
773+ List.iter (fun child ->
774+ if not !plaintext_found then begin
775+ write_newline ();
776+ let pt = to_writer_internal ~pretty ~indent_size ~indent:(indent + 1) ~text_mode:child_text_mode w child in
777+ if pt then plaintext_found := true
778+ end
779+ ) node.children;
780+ (* Only add closing tag if plaintext wasn't found *)
781+ if not !plaintext_found then begin
782 write_newline ();
783 write_prefix ();
784 write (serialize_end_tag name)
785+ end;
786+ !plaintext_found
787 end
788 end
789+790+(* Public wrapper that discards the plaintext flag *)
791+let to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node =
792+ ignore (to_writer_internal ~pretty ~indent_size ~indent w node)
+3
lib/html5rw/parser/parser_tree_builder.ml
···208 end
209210let insert_element t name ?(namespace=None) ?(push=false) attrs =
000211 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
212 let node = Dom.create_element name ~namespace ~attrs ~location () in
213 let (parent, before) = appropriate_insertion_place t in
···208 end
209210let insert_element t name ?(namespace=None) ?(push=false) attrs =
211+ (* Reset ignore_lf flag - per HTML5 spec, only the immediately next token
212+ after pre/textarea/listing should be checked for leading LF *)
213+ t.ignore_lf <- false;
214 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
215 let node = Dom.create_element name ~namespace ~attrs ~location () in
216 let (parent, before) = appropriate_insertion_place t in
+13-1
lib/html5rw/tokenizer/tokenizer_impl.ml
···711 t.state <- Tokenizer_state.Bogus_comment
712713 and state_tag_name () =
714- match Tokenizer_stream.consume t.stream with
715 | Some ('\t' | '\n' | '\x0C' | ' ') ->
0716 t.state <- Tokenizer_state.Before_attribute_name
717 | Some '/' ->
0718 t.state <- Tokenizer_state.Self_closing_start_tag
719 | Some '>' ->
0720 t.state <- Tokenizer_state.Data;
721 emit_current_tag ()
722 | Some '\x00' ->
0723 error t "unexpected-null-character";
724 Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
0000000725 | Some c ->
0726 check_control_char c;
727 Buffer.add_char t.current_tag_name (ascii_lower c)
728 | None -> ()
···711 t.state <- Tokenizer_state.Bogus_comment
712713 and state_tag_name () =
714+ match Tokenizer_stream.peek t.stream with
715 | Some ('\t' | '\n' | '\x0C' | ' ') ->
716+ Tokenizer_stream.advance t.stream;
717 t.state <- Tokenizer_state.Before_attribute_name
718 | Some '/' ->
719+ Tokenizer_stream.advance t.stream;
720 t.state <- Tokenizer_state.Self_closing_start_tag
721 | Some '>' ->
722+ Tokenizer_stream.advance t.stream;
723 t.state <- Tokenizer_state.Data;
724 emit_current_tag ()
725 | Some '\x00' ->
726+ Tokenizer_stream.advance t.stream;
727 error t "unexpected-null-character";
728 Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
729+ | Some '<' ->
730+ (* Per HTML5 spec section 13.2.5.8: '<' is "anything else" - append to tag name.
731+ Note: The previous implementation incorrectly emitted the tag and switched
732+ to tag open state. The spec says to just append the character to the tag name
733+ without emitting an error. *)
734+ Tokenizer_stream.advance t.stream;
735+ Buffer.add_char t.current_tag_name '<'
736 | Some c ->
737+ Tokenizer_stream.advance t.stream;
738 check_control_char c;
739 Buffer.add_char t.current_tag_name (ascii_lower c)
740 | None -> ()