OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Test runner for html5lib-tests serializer tests *)
2
3module Dom = Html5rw.Dom
4module Report = Test_report
5
6(* Extract values from JSON *)
7let json_string = function
8 | Jsont.String (s, _) -> s
9 | _ -> failwith "Expected string"
10
11let json_string_opt = function
12 | Jsont.Null _ -> None
13 | Jsont.String (s, _) -> Some s
14 | _ -> failwith "Expected string or null"
15
16let json_bool = function
17 | Jsont.Bool (b, _) -> b
18 | _ -> failwith "Expected bool"
19
20let json_array = function
21 | Jsont.Array (arr, _) -> arr
22 | _ -> failwith "Expected array"
23
24let json_object = function
25 | Jsont.Object (obj, _) -> obj
26 | _ -> failwith "Expected object"
27
28let json_mem name obj =
29 match List.find_opt (fun ((n, _), _) -> n = name) obj with
30 | Some (_, v) -> Some v
31 | None -> None
32
33let json_mem_exn name obj =
34 match json_mem name obj with
35 | Some v -> v
36 | None -> failwith ("Missing member: " ^ name)
37
38let rec json_to_string = function
39 | Jsont.Null _ -> "null"
40 | Jsont.Bool (b, _) -> string_of_bool b
41 | Jsont.Number (n, _) -> Printf.sprintf "%g" n
42 | Jsont.String (s, _) -> Printf.sprintf "%S" s
43 | Jsont.Array (arr, _) ->
44 "[" ^ String.concat ", " (List.map json_to_string arr) ^ "]"
45 | Jsont.Object (obj, _) ->
46 "{" ^ String.concat ", " (List.map (fun ((n, _), v) -> Printf.sprintf "%S: %s" n (json_to_string v)) obj) ^ "}"
47
48(* Serialization options *)
49type serialize_options = {
50 quote_char : char;
51 quote_char_explicit : bool; (* Was quote_char explicitly set? *)
52 minimize_boolean_attributes : bool;
53 use_trailing_solidus : bool;
54 escape_lt_in_attrs : bool;
55 escape_rcdata : bool;
56 strip_whitespace : bool;
57 inject_meta_charset : bool;
58 encoding : string option;
59 omit_optional_tags : bool;
60}
61
62let default_options = {
63 quote_char = '"';
64 quote_char_explicit = false;
65 minimize_boolean_attributes = true;
66 use_trailing_solidus = false;
67 escape_lt_in_attrs = false;
68 escape_rcdata = false;
69 strip_whitespace = false;
70 inject_meta_charset = false;
71 encoding = None;
72 omit_optional_tags = true; (* HTML5 default *)
73}
74
75(* Parse options from JSON *)
76let parse_options json_opt =
77 match json_opt with
78 | None -> default_options
79 | Some json ->
80 let obj = json_object json in
81 let get_bool name default =
82 match json_mem name obj with
83 | Some j -> (try json_bool j with _ -> default)
84 | None -> default
85 in
86 let get_string name =
87 match json_mem name obj with
88 | Some (Jsont.String (s, _)) -> Some s
89 | _ -> None
90 in
91 let quote_char_opt =
92 match json_mem "quote_char" obj with
93 | Some (Jsont.String (s, _)) when String.length s = 1 -> Some s.[0]
94 | _ -> None
95 in
96 {
97 quote_char = Option.value ~default:'"' quote_char_opt;
98 quote_char_explicit = Option.is_some quote_char_opt;
99 minimize_boolean_attributes = get_bool "minimize_boolean_attributes" (get_bool "quote_attr_values" true);
100 use_trailing_solidus = get_bool "use_trailing_solidus" false;
101 escape_lt_in_attrs = get_bool "escape_lt_in_attrs" false;
102 escape_rcdata = get_bool "escape_rcdata" false;
103 strip_whitespace = get_bool "strip_whitespace" false;
104 inject_meta_charset = get_bool "inject_meta_charset" false;
105 encoding = get_string "encoding";
106 omit_optional_tags = get_bool "omit_optional_tags" true;
107 }
108
109(* Test case *)
110type test_case = {
111 description : string;
112 input : Jsont.json list;
113 expected : string list;
114 options : serialize_options;
115 raw_json : string; (* Original JSON representation of this test *)
116}
117
118let parse_test_case json =
119 let raw_json = json_to_string json in
120 let obj = json_object json in
121 let description = json_string (json_mem_exn "description" obj) in
122 let input = json_array (json_mem_exn "input" obj) in
123 let expected = List.map json_string (json_array (json_mem_exn "expected" obj)) in
124 let options = parse_options (json_mem "options" obj) in
125 { description; input; expected; options; raw_json }
126
127(* Parse attrs that can be either array [{name, value}] or object {name: value} or empty {} *)
128let parse_attrs attrs_json =
129 match attrs_json with
130 | Jsont.Array (arr, _) ->
131 List.map (fun attr_json ->
132 let attr_obj = json_object attr_json in
133 let attr_name = json_string (json_mem_exn "name" attr_obj) in
134 let value = json_string (json_mem_exn "value" attr_obj) in
135 (attr_name, value)
136 ) arr
137 | Jsont.Object (obj, _) ->
138 List.map (fun ((n, _), v) -> (n, json_string v)) obj
139 | _ -> []
140
141(* Void elements that don't need end tags *)
142let is_void_element name =
143 List.mem (String.lowercase_ascii name)
144 ["area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
145 "link"; "meta"; "param"; "source"; "track"; "wbr"]
146
147(* Raw text elements whose content should not be escaped *)
148let is_raw_text_element name =
149 List.mem (String.lowercase_ascii name) ["script"; "style"]
150
151(* Elements where whitespace should be preserved *)
152let is_whitespace_preserving_element name =
153 List.mem (String.lowercase_ascii name) ["pre"; "textarea"; "script"; "style"]
154
155(* Block elements that close a p tag *)
156let p_closing_elements = [
157 "address"; "article"; "aside"; "blockquote"; "datagrid"; "dialog"; "dir";
158 "div"; "dl"; "fieldset"; "footer"; "form"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6";
159 "header"; "hgroup"; "hr"; "main"; "menu"; "nav"; "ol"; "p"; "pre"; "section";
160 "table"; "ul"
161]
162
163let is_p_closing_element name =
164 List.mem (String.lowercase_ascii name) p_closing_elements
165
166(* Collapse runs of whitespace to single space *)
167let collapse_whitespace text =
168 let len = String.length text in
169 let buf = Buffer.create len in
170 let in_whitespace = ref false in
171 for i = 0 to len - 1 do
172 let c = text.[i] in
173 if c = '\t' || c = '\r' || c = '\n' || c = '\x0C' || c = ' ' then begin
174 if not !in_whitespace then begin
175 Buffer.add_char buf ' ';
176 in_whitespace := true
177 end
178 end else begin
179 Buffer.add_char buf c;
180 in_whitespace := false
181 end
182 done;
183 Buffer.contents buf
184
185(* Token types for stream-based serialization *)
186type token_type =
187 | StartTag of string * (string * string) list (* name, attrs *)
188 | EndTag of string (* name *)
189 | EmptyTag of string * (string * string) list (* name, attrs *)
190 | TextNode of string
191 | CommentNode of string
192 | DoctypeNode of Html5rw.Dom.node
193
194type token_info = {
195 token : token_type option;
196}
197
198let build_token_info token =
199 let arr = json_array token in
200 match arr with
201 | [] -> { token = None }
202 | type_json :: rest ->
203 let token_type_str = json_string type_json in
204 match token_type_str, rest with
205 | "StartTag", [_ns_json; name_json; attrs_json] ->
206 let name = json_string name_json in
207 let attrs = parse_attrs attrs_json in
208 { token = Some (StartTag (name, attrs)) }
209
210 | "StartTag", [name_json; attrs_json] ->
211 let name = json_string name_json in
212 let attrs = parse_attrs attrs_json in
213 { token = Some (StartTag (name, attrs)) }
214
215 | "EmptyTag", [name_json; attrs_json] ->
216 let name = json_string name_json in
217 let attrs = parse_attrs attrs_json in
218 { token = Some (EmptyTag (name, attrs)) }
219
220 | "EndTag", [_ns_json; name_json] ->
221 let name = json_string name_json in
222 { token = Some (EndTag name) }
223
224 | "EndTag", [name_json] ->
225 let name = json_string name_json in
226 { token = Some (EndTag name) }
227
228 | "Characters", [text_json] ->
229 let text = json_string text_json in
230 { token = Some (TextNode text) }
231
232 | "Comment", [text_json] ->
233 let text = json_string text_json in
234 { token = Some (CommentNode text) }
235
236 | "Doctype", [name_json] ->
237 let name = json_string name_json in
238 let node = Html5rw.Dom.create_doctype ~name () in
239 { token = Some (DoctypeNode node) }
240
241 | "Doctype", [name_json; public_json] ->
242 let name = json_string name_json in
243 let public_id = json_string_opt public_json in
244 let node = match public_id with
245 | Some pub -> Html5rw.Dom.create_doctype ~name ~public_id:pub ()
246 | None -> Html5rw.Dom.create_doctype ~name ()
247 in
248 { token = Some (DoctypeNode node) }
249
250 | "Doctype", [name_json; public_json; system_json] ->
251 let name = json_string name_json in
252 let public_id = json_string_opt public_json in
253 let system_id = json_string_opt system_json in
254 let node = match public_id, system_id with
255 | Some pub, Some sys -> Html5rw.Dom.create_doctype ~name ~public_id:pub ~system_id:sys ()
256 | Some pub, None -> Html5rw.Dom.create_doctype ~name ~public_id:pub ()
257 | None, Some sys -> Html5rw.Dom.create_doctype ~name ~system_id:sys ()
258 | None, None -> Html5rw.Dom.create_doctype ~name ()
259 in
260 { token = Some (DoctypeNode node) }
261
262 | _ -> { token = None }
263
264(* Serialize a single node to HTML with options *)
265let escape_text text =
266 let buf = Buffer.create (String.length text) in
267 String.iter (fun c ->
268 match c with
269 | '&' -> Buffer.add_string buf "&"
270 | '<' -> Buffer.add_string buf "<"
271 | '>' -> Buffer.add_string buf ">"
272 | c -> Buffer.add_char buf c
273 ) text;
274 Buffer.contents buf
275
276let can_unquote_attr_value value =
277 if String.length value = 0 then false
278 else
279 let valid = ref true in
280 String.iter (fun c ->
281 if c = '>' || c = '"' || c = '\'' || c = '=' || c = '`' ||
282 c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' then
283 valid := false
284 ) value;
285 !valid
286
287let escape_attr_value value quote_char escape_lt =
288 let buf = Buffer.create (String.length value) in
289 String.iter (fun c ->
290 match c with
291 | '&' -> Buffer.add_string buf "&"
292 | '"' when quote_char = '"' -> Buffer.add_string buf """
293 | '\'' when quote_char = '\'' -> Buffer.add_string buf "'"
294 | '<' when escape_lt -> Buffer.add_string buf "<"
295 | c -> Buffer.add_char buf c
296 ) value;
297 Buffer.contents buf
298
299let serialize_node opts ~in_raw_text node =
300 match node.Dom.name with
301 | "#text" ->
302 if in_raw_text && not opts.escape_rcdata then
303 node.Dom.data
304 else
305 escape_text node.Dom.data
306 | "#comment" ->
307 "<!--" ^ node.Dom.data ^ "-->"
308 | "!doctype" ->
309 let buf = Buffer.create 64 in
310 Buffer.add_string buf "<!DOCTYPE ";
311 (match node.Dom.doctype with
312 | Some dt ->
313 Buffer.add_string buf (Option.value ~default:"html" dt.Dom.name);
314 (match dt.Dom.public_id with
315 | Some pub when pub <> "" ->
316 Buffer.add_string buf " PUBLIC \"";
317 Buffer.add_string buf pub;
318 Buffer.add_char buf '"';
319 (match dt.Dom.system_id with
320 | Some sys ->
321 Buffer.add_string buf " \"";
322 Buffer.add_string buf sys;
323 Buffer.add_char buf '"'
324 | None -> ())
325 | _ ->
326 match dt.Dom.system_id with
327 | Some sys when sys <> "" ->
328 Buffer.add_string buf " SYSTEM \"";
329 Buffer.add_string buf sys;
330 Buffer.add_char buf '"'
331 | _ -> ())
332 | None -> Buffer.add_string buf "html");
333 Buffer.add_char buf '>';
334 Buffer.contents buf
335 | _ -> failwith "serialize_node called with element"
336
337let choose_quote value default_quote explicit =
338 (* If quote_char was explicitly set, always use it *)
339 if explicit then default_quote
340 else
341 (* Otherwise, if value contains the default quote but not the other, use the other *)
342 let has_double = String.contains value '"' in
343 let has_single = String.contains value '\'' in
344 if has_double && not has_single then '\''
345 else if has_single && not has_double then '"'
346 else default_quote
347
348(* Serialize an element tag (start tag) *)
349let serialize_start_tag opts ~is_empty_tag name attrs =
350 let buf = Buffer.create 64 in
351 Buffer.add_char buf '<';
352 Buffer.add_string buf name;
353 (* Sort attributes alphabetically for consistent output *)
354 let sorted_attrs = List.sort (fun (a, _) (b, _) -> String.compare a b) attrs in
355 List.iter (fun (key, value) ->
356 Buffer.add_char buf ' ';
357 Buffer.add_string buf key;
358 let should_minimize =
359 opts.minimize_boolean_attributes &&
360 String.lowercase_ascii key = String.lowercase_ascii value
361 in
362 if should_minimize then
363 ()
364 else if String.length value = 0 then begin
365 Buffer.add_char buf '=';
366 Buffer.add_char buf opts.quote_char;
367 Buffer.add_char buf opts.quote_char
368 end else if can_unquote_attr_value value then begin
369 Buffer.add_char buf '=';
370 Buffer.add_string buf value
371 end else begin
372 let quote = choose_quote value opts.quote_char opts.quote_char_explicit in
373 Buffer.add_char buf '=';
374 Buffer.add_char buf quote;
375 Buffer.add_string buf (escape_attr_value value quote opts.escape_lt_in_attrs);
376 Buffer.add_char buf quote
377 end
378 ) sorted_attrs;
379 if opts.use_trailing_solidus && (is_empty_tag || is_void_element name) then
380 Buffer.add_string buf " /";
381 Buffer.add_char buf '>';
382 Buffer.contents buf
383
384(* Check if text starts with ASCII whitespace *)
385let text_starts_with_space text =
386 String.length text > 0 &&
387 let c = text.[0] in
388 c = '\t' || c = '\n' || c = '\x0C' || c = '\r' || c = ' '
389
390(* Optional tag omission helpers *)
391type next_token =
392 | NTComment
393 | NTSpace (* Text starting with space *)
394 | NTText (* Text not starting with space *)
395 | NTStartTag of string
396 | NTEmptyTag of string
397 | NTEndTag of string
398 | NTEOF
399
400let classify_next tokens idx =
401 if idx >= Array.length tokens then NTEOF
402 else match tokens.(idx).token with
403 | None -> NTEOF
404 | Some (CommentNode _) -> NTComment
405 | Some (TextNode text) ->
406 if text_starts_with_space text then NTSpace else NTText
407 | Some (StartTag (name, _)) -> NTStartTag (String.lowercase_ascii name)
408 | Some (EmptyTag (name, _)) -> NTEmptyTag (String.lowercase_ascii name)
409 | Some (EndTag name) -> NTEndTag (String.lowercase_ascii name)
410 | Some (DoctypeNode _) -> NTEOF (* Treat doctype as if nothing follows *)
411
412(* Should we omit a start tag? *)
413let should_omit_start_tag opts name attrs next =
414 if not opts.omit_optional_tags then false
415 else
416 let name = String.lowercase_ascii name in
417 match name, next with
418 (* html start tag: omit if not followed by comment or space, AND has no attributes *)
419 | "html", NTComment -> false
420 | "html", NTSpace -> false
421 | "html", _ -> attrs = [] (* only omit if no attributes *)
422 (* head start tag: omit if followed by element (start/empty tag) *)
423 | "head", NTStartTag _ -> true
424 | "head", NTEmptyTag _ -> true
425 | "head", NTEndTag "head" -> true (* empty head *)
426 | "head", NTEOF -> true
427 | "head", _ -> false
428 (* body start tag: omit if not followed by comment or space, AND has no attributes *)
429 | "body", NTComment -> false
430 | "body", NTSpace -> false
431 | "body", _ -> attrs = [] (* only omit if no attributes *)
432 (* colgroup start tag: omit if followed by col element *)
433 | "colgroup", NTStartTag "col" -> true
434 | "colgroup", NTEmptyTag "col" -> true
435 | "colgroup", _ -> false
436 (* tbody start tag: omit if first child is tr *)
437 | "tbody", NTStartTag "tr" -> true
438 | "tbody", _ -> false
439 | _ -> false
440
441(* Should we omit an end tag? *)
442let should_omit_end_tag opts name next =
443 if not opts.omit_optional_tags then false
444 else
445 let name = String.lowercase_ascii name in
446 match name, next with
447 (* html end tag: omit if not followed by comment or space *)
448 | "html", NTComment -> false
449 | "html", NTSpace -> false
450 | "html", _ -> true
451 (* head end tag: omit if not followed by comment or space *)
452 | "head", NTComment -> false
453 | "head", NTSpace -> false
454 | "head", _ -> true
455 (* body end tag: omit if not followed by comment or space *)
456 | "body", NTComment -> false
457 | "body", NTSpace -> false
458 | "body", _ -> true
459 (* li end tag: omit if followed by li start tag or parent end tag *)
460 | "li", NTStartTag "li" -> true
461 | "li", NTEndTag _ -> true
462 | "li", NTEOF -> true
463 | "li", _ -> false
464 (* dt end tag: omit if followed by dt or dd start tag (NOT end tag or EOF!) *)
465 | "dt", NTStartTag "dt" -> true
466 | "dt", NTStartTag "dd" -> true
467 | "dt", _ -> false
468 (* dd end tag: omit if followed by dd or dt start tag, or end tag, or EOF *)
469 | "dd", NTStartTag "dd" -> true
470 | "dd", NTStartTag "dt" -> true
471 | "dd", NTEndTag _ -> true
472 | "dd", NTEOF -> true
473 | "dd", _ -> false
474 (* p end tag: omit if followed by block element (start or empty tag), end tag, or EOF *)
475 | "p", NTStartTag next_name when is_p_closing_element next_name -> true
476 | "p", NTEmptyTag next_name when is_p_closing_element next_name -> true
477 | "p", NTEndTag _ -> true
478 | "p", NTEOF -> true
479 | "p", _ -> false
480 (* optgroup end tag: omit if followed by optgroup start tag, end tag, or EOF *)
481 | "optgroup", NTStartTag "optgroup" -> true
482 | "optgroup", NTEndTag _ -> true
483 | "optgroup", NTEOF -> true
484 | "optgroup", _ -> false
485 (* option end tag: omit if followed by option/optgroup start tag, end tag, or EOF *)
486 | "option", NTStartTag "option" -> true
487 | "option", NTStartTag "optgroup" -> true
488 | "option", NTEndTag _ -> true
489 | "option", NTEOF -> true
490 | "option", _ -> false
491 (* colgroup end tag: omit if not followed by comment, space, or another colgroup *)
492 | "colgroup", NTComment -> false
493 | "colgroup", NTSpace -> false
494 | "colgroup", NTStartTag "colgroup" -> false (* keep end tag when another colgroup follows *)
495 | "colgroup", _ -> true
496 (* thead end tag: omit if followed by tbody or tfoot start tag *)
497 | "thead", NTStartTag "tbody" -> true
498 | "thead", NTStartTag "tfoot" -> true
499 | "thead", _ -> false
500 (* tbody end tag: omit if followed by tbody/tfoot start tag, end tag, or EOF *)
501 | "tbody", NTStartTag "tbody" -> true
502 | "tbody", NTStartTag "tfoot" -> true
503 | "tbody", NTEndTag _ -> true
504 | "tbody", NTEOF -> true
505 | "tbody", _ -> false
506 (* tfoot end tag: omit if followed by tbody start tag, end tag, or EOF *)
507 | "tfoot", NTStartTag "tbody" -> true
508 | "tfoot", NTEndTag _ -> true
509 | "tfoot", NTEOF -> true
510 | "tfoot", _ -> false
511 (* tr end tag: omit if followed by tr start tag, end tag, or EOF *)
512 | "tr", NTStartTag "tr" -> true
513 | "tr", NTEndTag _ -> true
514 | "tr", NTEOF -> true
515 | "tr", _ -> false
516 (* td end tag: omit if followed by td/th start tag, end tag, or EOF *)
517 | "td", NTStartTag "td" -> true
518 | "td", NTStartTag "th" -> true
519 | "td", NTEndTag _ -> true
520 | "td", NTEOF -> true
521 | "td", _ -> false
522 (* th end tag: omit if followed by th/td start tag, end tag, or EOF *)
523 | "th", NTStartTag "th" -> true
524 | "th", NTStartTag "td" -> true
525 | "th", NTEndTag _ -> true
526 | "th", NTEOF -> true
527 | "th", _ -> false
528 | _ -> false
529
530(* Run a single test *)
531let run_test test =
532 try
533 (* Build token infos from input *)
534 let token_infos = Array.of_list (List.map build_token_info test.input) in
535 let num_tokens = Array.length token_infos in
536
537 (* Handle inject_meta_charset option *)
538 let inject_meta = test.options.inject_meta_charset in
539 let encoding = test.options.encoding in
540
541 (* Serialize with context tracking *)
542 let buf = Buffer.create 256 in
543 let in_raw_text = ref false in
544 let preserve_whitespace = ref false in
545 let element_stack : string list ref = ref [] in
546 let in_head = ref false in
547 let meta_charset_injected = ref false in
548 let prev_was_section_end = ref false in (* Track if prev token was thead/tbody/tfoot end *)
549
550 for i = 0 to num_tokens - 1 do
551 let info = token_infos.(i) in
552 let next = classify_next token_infos (i + 1) in
553
554 match info.token with
555 | None -> ()
556
557 | Some (StartTag (name, attrs)) ->
558 let name_lower = String.lowercase_ascii name in
559
560 (* Track head element *)
561 if name_lower = "head" then in_head := true;
562
563 (* For inject_meta_charset, we need to check if there's any charset meta coming up *)
564 (* If yes, don't inject at head start; if no, inject at head start *)
565 let should_inject_at_head =
566 if not inject_meta || name_lower <> "head" then false
567 else match encoding with
568 | None -> false
569 | Some _ ->
570 (* Look ahead to see if there's a charset meta or http-equiv content-type *)
571 let has_charset_meta = ref false in
572 for j = i + 1 to num_tokens - 1 do
573 match token_infos.(j).token with
574 | Some (EmptyTag (n, a)) when String.lowercase_ascii n = "meta" ->
575 let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") a in
576 let has_http_equiv_ct = List.exists (fun (k, v) ->
577 String.lowercase_ascii k = "http-equiv" && String.lowercase_ascii v = "content-type") a in
578 if has_charset || has_http_equiv_ct then has_charset_meta := true
579 | Some (EndTag n) when String.lowercase_ascii n = "head" -> ()
580 | _ -> ()
581 done;
582 not !has_charset_meta
583 in
584
585 (* Special case: tbody start tag cannot be omitted if preceded by section end tag *)
586 let can_omit_start =
587 if name_lower = "tbody" && !prev_was_section_end then false
588 else should_omit_start_tag test.options name attrs next
589 in
590 prev_was_section_end := false; (* Reset for next iteration *)
591
592 if should_inject_at_head then begin
593 meta_charset_injected := true;
594 (* Don't output head start tag if we should omit it *)
595 if not can_omit_start then
596 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs);
597 Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" (Option.get encoding));
598 element_stack := name :: !element_stack;
599 if is_raw_text_element name then in_raw_text := true;
600 if is_whitespace_preserving_element name then preserve_whitespace := true
601 end else if not can_omit_start then begin
602 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs);
603 element_stack := name :: !element_stack;
604 if is_raw_text_element name then in_raw_text := true;
605 if is_whitespace_preserving_element name then preserve_whitespace := true
606 end else begin
607 element_stack := name :: !element_stack;
608 if is_raw_text_element name then in_raw_text := true;
609 if is_whitespace_preserving_element name then preserve_whitespace := true
610 end
611
612 | Some (EmptyTag (name, attrs)) ->
613 let name_lower = String.lowercase_ascii name in
614 prev_was_section_end := false; (* Reset for next iteration *)
615
616 (* Handle meta charset replacement *)
617 if inject_meta && !in_head && name_lower = "meta" then begin
618 let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") attrs in
619 let has_http_equiv_ct =
620 List.exists (fun (k, v) ->
621 String.lowercase_ascii k = "http-equiv" &&
622 String.lowercase_ascii v = "content-type"
623 ) attrs
624 in
625 if has_charset then begin
626 (* Replace charset value *)
627 match encoding with
628 | Some enc ->
629 Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" enc)
630 | None -> () (* No encoding, skip the meta tag *)
631 end else if has_http_equiv_ct then begin
632 (* Replace charset in content value *)
633 match encoding with
634 | Some enc ->
635 let new_attrs = List.map (fun (k, v) ->
636 if String.lowercase_ascii k = "content" then
637 let new_content = Printf.sprintf "text/html; charset=%s" enc in
638 (k, new_content)
639 else (k, v)
640 ) attrs in
641 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name new_attrs)
642 | None ->
643 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
644 end else begin
645 (* Regular meta tag, output as normal *)
646 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
647 end
648 end else
649 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
650
651 | Some (EndTag name) ->
652 let name_lower = String.lowercase_ascii name in
653
654 (* Track head element *)
655 if name_lower = "head" then in_head := false;
656
657 (* Pop from element stack *)
658 (match !element_stack with
659 | top :: rest when String.lowercase_ascii top = name_lower ->
660 element_stack := rest;
661 if is_raw_text_element name then in_raw_text := false;
662 if is_whitespace_preserving_element name then preserve_whitespace := false
663 | _ -> ());
664
665 let is_section_end = List.mem name_lower ["thead"; "tbody"; "tfoot"] in
666 let omit = should_omit_end_tag test.options name next in
667
668 if not omit then begin
669 Buffer.add_string buf "</";
670 Buffer.add_string buf name;
671 Buffer.add_char buf '>'
672 end;
673
674 (* Track if we omitted a section end tag - next tbody can't be omitted *)
675 prev_was_section_end := is_section_end && omit
676
677 | Some (TextNode text) ->
678 prev_was_section_end := false;
679 let processed_text =
680 if !in_raw_text && not test.options.escape_rcdata then
681 text
682 else if test.options.strip_whitespace && not !preserve_whitespace then
683 escape_text (collapse_whitespace text)
684 else
685 escape_text text
686 in
687 Buffer.add_string buf processed_text
688
689 | Some (CommentNode text) ->
690 prev_was_section_end := false;
691 Buffer.add_string buf "<!--";
692 Buffer.add_string buf text;
693 Buffer.add_string buf "-->"
694
695 | Some (DoctypeNode node) ->
696 prev_was_section_end := false;
697 Buffer.add_string buf (serialize_node test.options ~in_raw_text:false node)
698 done;
699
700 let serialized = Buffer.contents buf in
701
702 (* Check if it matches any expected output *)
703 let matches = List.exists (fun exp -> serialized = exp) test.expected in
704
705 (matches, serialized, test.expected)
706 with e ->
707 (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected)
708
709(* Run all tests in a file *)
710let format_options opts =
711 let parts = [] in
712 let parts = if opts.quote_char_explicit then
713 Printf.sprintf "quote_char='%c'" opts.quote_char :: parts else parts in
714 let parts = if not opts.minimize_boolean_attributes then
715 "minimize_bool=false" :: parts else parts in
716 let parts = if opts.use_trailing_solidus then
717 "trailing_solidus=true" :: parts else parts in
718 let parts = if opts.escape_lt_in_attrs then
719 "escape_lt=true" :: parts else parts in
720 let parts = if opts.escape_rcdata then
721 "escape_rcdata=true" :: parts else parts in
722 let parts = if opts.strip_whitespace then
723 "strip_ws=true" :: parts else parts in
724 let parts = if opts.inject_meta_charset then
725 "inject_charset=true" :: parts else parts in
726 let parts = if not opts.omit_optional_tags then
727 "omit_tags=false" :: parts else parts in
728 if parts = [] then "(defaults)" else String.concat ", " (List.rev parts)
729
730let run_file path =
731 let content =
732 let ic = open_in path in
733 let n = in_channel_length ic in
734 let s = really_input_string ic n in
735 close_in ic;
736 s
737 in
738
739 let json = match Jsont_bytesrw.decode_string Jsont.json content with
740 | Ok j -> j
741 | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e)
742 in
743
744 let obj = json_object json in
745 let tests_json = match json_mem "tests" obj with
746 | Some t -> json_array t
747 | None -> []
748 in
749
750 let filename = Filename.basename path in
751 let passed = ref 0 in
752 let failed = ref 0 in
753 let results = ref [] in
754
755 List.iteri (fun i test_json ->
756 try
757 let test = parse_test_case test_json in
758 let (success, actual, expected) = run_test test in
759
760 let result : Report.test_result = {
761 test_num = i + 1;
762 description = test.description;
763 input = String.concat "\n" (List.map (fun tok ->
764 (* Simplified token representation *)
765 match tok with
766 | Jsont.Array (arr, _) ->
767 (match arr with
768 | Jsont.String (ty, _) :: rest ->
769 Printf.sprintf "%s: %s" ty (String.concat ", " (List.map (function
770 | Jsont.String (s, _) -> Printf.sprintf "%S" s
771 | Jsont.Object _ -> "{...}"
772 | Jsont.Null _ -> "null"
773 | _ -> "?"
774 ) rest))
775 | _ -> "?")
776 | _ -> "?"
777 ) test.input);
778 expected = String.concat " | " expected;
779 actual;
780 success;
781 details = [
782 ("Options", format_options test.options);
783 ("Expected Variants", string_of_int (List.length expected));
784 ];
785 raw_test_data = Some test.raw_json;
786 } in
787 results := result :: !results;
788
789 if success then incr passed else incr failed
790 with e ->
791 incr failed;
792 let result : Report.test_result = {
793 test_num = i + 1;
794 description = Printf.sprintf "Test %d" (i + 1);
795 input = "";
796 expected = "";
797 actual = Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e);
798 success = false;
799 details = [];
800 raw_test_data = Some (json_to_string test_json);
801 } in
802 results := result :: !results;
803 Printf.eprintf "Exception parsing test %d: %s\n" (i + 1) (Printexc.to_string e)
804 ) tests_json;
805
806 let file_result : Report.file_result = {
807 filename;
808 test_type = "Serializer";
809 passed_count = !passed;
810 failed_count = !failed;
811 tests = List.rev !results;
812 } in
813 (file_result, !passed, !failed)
814
815let () =
816 let test_dir = Sys.argv.(1) in
817 let files = Sys.readdir test_dir |> Array.to_list in
818 let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in
819
820 let total_passed = ref 0 in
821 let total_failed = ref 0 in
822 let file_results = ref [] in
823
824 List.iter (fun file ->
825 let path = Filename.concat test_dir file in
826 let (file_result, passed, failed) = run_file path in
827 total_passed := !total_passed + passed;
828 total_failed := !total_failed + failed;
829 file_results := file_result :: !file_results;
830 Printf.printf "%s: %d passed, %d failed\n" file passed failed
831 ) (List.sort String.compare test_files);
832
833 Printf.printf "\n=== Summary ===\n";
834 Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;
835
836 (* Generate HTML report *)
837 let report : Report.report = {
838 title = "HTML5 Serializer Tests";
839 test_type = "serializer";
840 description = "These tests validate the HTML serialization algorithm for converting DOM trees back to HTML text. \
841 Each test provides a sequence of tokens (start tags, end tags, text, comments, doctypes) and one \
842 or more valid serialized outputs. Tests cover attribute quoting, boolean attribute minimization, \
843 self-closing tag syntax (trailing solidus), entity escaping, whitespace handling, meta charset \
844 injection, and optional tag omission rules as specified in the HTML Standard. Multiple expected \
845 outputs allow for valid variations in serialization style.";
846 files = List.rev !file_results;
847 total_passed = !total_passed;
848 total_failed = !total_failed;
849 match_quality = None;
850 test_type_breakdown = None;
851 strictness_mode = None;
852 run_timestamp = None;
853 } in
854 Report.generate_report report "test_serializer_report.html";
855
856 exit (if !total_failed > 0 then 1 else 0)