test/test_serializer.ml at 81c4816404ceafd6d88e08303e3870f364dc0a32 · anil.recoil.org/ocaml-html5rw

OCaml HTML5 parser/serialiser based on Python's JustHTML
ocaml-html5rw / test / test_serializer.ml
at 81c4816404ceafd6d88e08303e3870f364dc0a32 32 kB view raw
  1(* Test runner for html5lib-tests serializer tests *)
  2
  3module Dom = Html5rw.Dom
  4module Report = Test_report
  5
  6(* Extract values from JSON *)
  7let json_string = function
  8  | Jsont.String (s, _) -> s
  9  | _ -> failwith "Expected string"
 10
 11let json_string_opt = function
 12  | Jsont.Null _ -> None
 13  | Jsont.String (s, _) -> Some s
 14  | _ -> failwith "Expected string or null"
 15
 16let json_bool = function
 17  | Jsont.Bool (b, _) -> b
 18  | _ -> failwith "Expected bool"
 19
 20let json_array = function
 21  | Jsont.Array (arr, _) -> arr
 22  | _ -> failwith "Expected array"
 23
 24let json_object = function
 25  | Jsont.Object (obj, _) -> obj
 26  | _ -> failwith "Expected object"
 27
 28let json_mem name obj =
 29  match List.find_opt (fun ((n, _), _) -> n = name) obj with
 30  | Some (_, v) -> Some v
 31  | None -> None
 32
 33let json_mem_exn name obj =
 34  match json_mem name obj with
 35  | Some v -> v
 36  | None -> failwith ("Missing member: " ^ name)
 37
 38let rec json_to_string = function
 39  | Jsont.Null _ -> "null"
 40  | Jsont.Bool (b, _) -> string_of_bool b
 41  | Jsont.Number (n, _) -> Printf.sprintf "%g" n
 42  | Jsont.String (s, _) -> Printf.sprintf "%S" s
 43  | Jsont.Array (arr, _) ->
 44    "[" ^ String.concat ", " (List.map json_to_string arr) ^ "]"
 45  | Jsont.Object (obj, _) ->
 46    "{" ^ String.concat ", " (List.map (fun ((n, _), v) -> Printf.sprintf "%S: %s" n (json_to_string v)) obj) ^ "}"
 47
 48(* Serialization options *)
 49type serialize_options = {
 50  quote_char : char;
 51  quote_char_explicit : bool;  (* Was quote_char explicitly set? *)
 52  minimize_boolean_attributes : bool;
 53  use_trailing_solidus : bool;
 54  escape_lt_in_attrs : bool;
 55  escape_rcdata : bool;
 56  strip_whitespace : bool;
 57  inject_meta_charset : bool;
 58  encoding : string option;
 59  omit_optional_tags : bool;
 60}
 61
 62let default_options = {
 63  quote_char = '"';
 64  quote_char_explicit = false;
 65  minimize_boolean_attributes = true;
 66  use_trailing_solidus = false;
 67  escape_lt_in_attrs = false;
 68  escape_rcdata = false;
 69  strip_whitespace = false;
 70  inject_meta_charset = false;
 71  encoding = None;
 72  omit_optional_tags = true;  (* HTML5 default *)
 73}
 74
 75(* Parse options from JSON *)
 76let parse_options json_opt =
 77  match json_opt with
 78  | None -> default_options
 79  | Some json ->
 80    let obj = json_object json in
 81    let get_bool name default =
 82      match json_mem name obj with
 83      | Some j -> (try json_bool j with _ -> default)
 84      | None -> default
 85    in
 86    let get_string name =
 87      match json_mem name obj with
 88      | Some (Jsont.String (s, _)) -> Some s
 89      | _ -> None
 90    in
 91    let quote_char_opt =
 92      match json_mem "quote_char" obj with
 93      | Some (Jsont.String (s, _)) when String.length s = 1 -> Some s.[0]
 94      | _ -> None
 95    in
 96    {
 97      quote_char = Option.value ~default:'"' quote_char_opt;
 98      quote_char_explicit = Option.is_some quote_char_opt;
 99      minimize_boolean_attributes = get_bool "minimize_boolean_attributes" (get_bool "quote_attr_values" true);
100      use_trailing_solidus = get_bool "use_trailing_solidus" false;
101      escape_lt_in_attrs = get_bool "escape_lt_in_attrs" false;
102      escape_rcdata = get_bool "escape_rcdata" false;
103      strip_whitespace = get_bool "strip_whitespace" false;
104      inject_meta_charset = get_bool "inject_meta_charset" false;
105      encoding = get_string "encoding";
106      omit_optional_tags = get_bool "omit_optional_tags" true;
107    }
108
109(* Test case *)
110type test_case = {
111  description : string;
112  input : Jsont.json list;
113  expected : string list;
114  options : serialize_options;
115  raw_json : string;  (* Original JSON representation of this test *)
116}
117
118let parse_test_case json =
119  let raw_json = json_to_string json in
120  let obj = json_object json in
121  let description = json_string (json_mem_exn "description" obj) in
122  let input = json_array (json_mem_exn "input" obj) in
123  let expected = List.map json_string (json_array (json_mem_exn "expected" obj)) in
124  let options = parse_options (json_mem "options" obj) in
125  { description; input; expected; options; raw_json }
126
127(* Parse attrs that can be either array [{name, value}] or object {name: value} or empty {} *)
128let parse_attrs attrs_json =
129  match attrs_json with
130  | Jsont.Array (arr, _) ->
131    List.map (fun attr_json ->
132      let attr_obj = json_object attr_json in
133      let attr_name = json_string (json_mem_exn "name" attr_obj) in
134      let value = json_string (json_mem_exn "value" attr_obj) in
135      (attr_name, value)
136    ) arr
137  | Jsont.Object (obj, _) ->
138    List.map (fun ((n, _), v) -> (n, json_string v)) obj
139  | _ -> []
140
141(* Void elements that don't need end tags *)
142let is_void_element name =
143  List.mem (String.lowercase_ascii name)
144    ["area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
145     "link"; "meta"; "param"; "source"; "track"; "wbr"]
146
147(* Raw text elements whose content should not be escaped *)
148let is_raw_text_element name =
149  List.mem (String.lowercase_ascii name) ["script"; "style"]
150
151(* Elements where whitespace should be preserved *)
152let is_whitespace_preserving_element name =
153  List.mem (String.lowercase_ascii name) ["pre"; "textarea"; "script"; "style"]
154
155(* Block elements that close a p tag *)
156let p_closing_elements = [
157  "address"; "article"; "aside"; "blockquote"; "datagrid"; "dialog"; "dir";
158  "div"; "dl"; "fieldset"; "footer"; "form"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6";
159  "header"; "hgroup"; "hr"; "main"; "menu"; "nav"; "ol"; "p"; "pre"; "section";
160  "table"; "ul"
161]
162
163let is_p_closing_element name =
164  List.mem (String.lowercase_ascii name) p_closing_elements
165
166(* Collapse runs of whitespace to single space *)
167let collapse_whitespace text =
168  let len = String.length text in
169  let buf = Buffer.create len in
170  let in_whitespace = ref false in
171  for i = 0 to len - 1 do
172    let c = text.[i] in
173    if c = '\t' || c = '\r' || c = '\n' || c = '\x0C' || c = ' ' then begin
174      if not !in_whitespace then begin
175        Buffer.add_char buf ' ';
176        in_whitespace := true
177      end
178    end else begin
179      Buffer.add_char buf c;
180      in_whitespace := false
181    end
182  done;
183  Buffer.contents buf
184
185(* Token types for stream-based serialization *)
186type token_type =
187  | StartTag of string * (string * string) list  (* name, attrs *)
188  | EndTag of string  (* name *)
189  | EmptyTag of string * (string * string) list  (* name, attrs *)
190  | TextNode of string
191  | CommentNode of string
192  | DoctypeNode of Html5rw.Dom.node
193
194type token_info = {
195  token : token_type option;
196}
197
198let build_token_info token =
199  let arr = json_array token in
200  match arr with
201  | [] -> { token = None }
202  | type_json :: rest ->
203    let token_type_str = json_string type_json in
204    match token_type_str, rest with
205    | "StartTag", [_ns_json; name_json; attrs_json] ->
206      let name = json_string name_json in
207      let attrs = parse_attrs attrs_json in
208      { token = Some (StartTag (name, attrs)) }
209
210    | "StartTag", [name_json; attrs_json] ->
211      let name = json_string name_json in
212      let attrs = parse_attrs attrs_json in
213      { token = Some (StartTag (name, attrs)) }
214
215    | "EmptyTag", [name_json; attrs_json] ->
216      let name = json_string name_json in
217      let attrs = parse_attrs attrs_json in
218      { token = Some (EmptyTag (name, attrs)) }
219
220    | "EndTag", [_ns_json; name_json] ->
221      let name = json_string name_json in
222      { token = Some (EndTag name) }
223
224    | "EndTag", [name_json] ->
225      let name = json_string name_json in
226      { token = Some (EndTag name) }
227
228    | "Characters", [text_json] ->
229      let text = json_string text_json in
230      { token = Some (TextNode text) }
231
232    | "Comment", [text_json] ->
233      let text = json_string text_json in
234      { token = Some (CommentNode text) }
235
236    | "Doctype", [name_json] ->
237      let name = json_string name_json in
238      let node = Html5rw.Dom.create_doctype ~name () in
239      { token = Some (DoctypeNode node) }
240
241    | "Doctype", [name_json; public_json] ->
242      let name = json_string name_json in
243      let public_id = json_string_opt public_json in
244      let node = match public_id with
245       | Some pub -> Html5rw.Dom.create_doctype ~name ~public_id:pub ()
246       | None -> Html5rw.Dom.create_doctype ~name ()
247      in
248      { token = Some (DoctypeNode node) }
249
250    | "Doctype", [name_json; public_json; system_json] ->
251      let name = json_string name_json in
252      let public_id = json_string_opt public_json in
253      let system_id = json_string_opt system_json in
254      let node = match public_id, system_id with
255       | Some pub, Some sys -> Html5rw.Dom.create_doctype ~name ~public_id:pub ~system_id:sys ()
256       | Some pub, None -> Html5rw.Dom.create_doctype ~name ~public_id:pub ()
257       | None, Some sys -> Html5rw.Dom.create_doctype ~name ~system_id:sys ()
258       | None, None -> Html5rw.Dom.create_doctype ~name ()
259      in
260      { token = Some (DoctypeNode node) }
261
262    | _ -> { token = None }
263
264(* Serialize a single node to HTML with options *)
265let escape_text text =
266  let buf = Buffer.create (String.length text) in
267  String.iter (fun c ->
268    match c with
269    | '&' -> Buffer.add_string buf "&amp;"
270    | '<' -> Buffer.add_string buf "&lt;"
271    | '>' -> Buffer.add_string buf "&gt;"
272    | c -> Buffer.add_char buf c
273  ) text;
274  Buffer.contents buf
275
276let can_unquote_attr_value value =
277  if String.length value = 0 then false
278  else
279    let valid = ref true in
280    String.iter (fun c ->
281      if c = '>' || c = '"' || c = '\'' || c = '=' || c = '`' ||
282         c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' then
283        valid := false
284    ) value;
285    !valid
286
287let escape_attr_value value quote_char escape_lt =
288  let buf = Buffer.create (String.length value) in
289  String.iter (fun c ->
290    match c with
291    | '&' -> Buffer.add_string buf "&amp;"
292    | '"' when quote_char = '"' -> Buffer.add_string buf "&quot;"
293    | '\'' when quote_char = '\'' -> Buffer.add_string buf "&#39;"
294    | '<' when escape_lt -> Buffer.add_string buf "&lt;"
295    | c -> Buffer.add_char buf c
296  ) value;
297  Buffer.contents buf
298
299let serialize_node opts ~in_raw_text node =
300  match node.Dom.name with
301  | "#text" ->
302    if in_raw_text && not opts.escape_rcdata then
303      node.Dom.data
304    else
305      escape_text node.Dom.data
306  | "#comment" ->
307    "<!--" ^ node.Dom.data ^ "-->"
308  | "!doctype" ->
309    let buf = Buffer.create 64 in
310    Buffer.add_string buf "<!DOCTYPE ";
311    (match node.Dom.doctype with
312     | Some dt ->
313       Buffer.add_string buf (Option.value ~default:"html" dt.Dom.name);
314       (match dt.Dom.public_id with
315        | Some pub when pub <> "" ->
316          Buffer.add_string buf " PUBLIC \"";
317          Buffer.add_string buf pub;
318          Buffer.add_char buf '"';
319          (match dt.Dom.system_id with
320           | Some sys ->
321             Buffer.add_string buf " \"";
322             Buffer.add_string buf sys;
323             Buffer.add_char buf '"'
324           | None -> ())
325        | _ ->
326          match dt.Dom.system_id with
327          | Some sys when sys <> "" ->
328            Buffer.add_string buf " SYSTEM \"";
329            Buffer.add_string buf sys;
330            Buffer.add_char buf '"'
331          | _ -> ())
332     | None -> Buffer.add_string buf "html");
333    Buffer.add_char buf '>';
334    Buffer.contents buf
335  | _ -> failwith "serialize_node called with element"
336
337let choose_quote value default_quote explicit =
338  (* If quote_char was explicitly set, always use it *)
339  if explicit then default_quote
340  else
341    (* Otherwise, if value contains the default quote but not the other, use the other *)
342    let has_double = String.contains value '"' in
343    let has_single = String.contains value '\'' in
344    if has_double && not has_single then '\''
345    else if has_single && not has_double then '"'
346    else default_quote
347
348(* Serialize an element tag (start tag) *)
349let serialize_start_tag opts ~is_empty_tag name attrs =
350  let buf = Buffer.create 64 in
351  Buffer.add_char buf '<';
352  Buffer.add_string buf name;
353  (* Sort attributes alphabetically for consistent output *)
354  let sorted_attrs = List.sort (fun (a, _) (b, _) -> String.compare a b) attrs in
355  List.iter (fun (key, value) ->
356    Buffer.add_char buf ' ';
357    Buffer.add_string buf key;
358    let should_minimize =
359      opts.minimize_boolean_attributes &&
360      String.lowercase_ascii key = String.lowercase_ascii value
361    in
362    if should_minimize then
363      ()
364    else if String.length value = 0 then begin
365      Buffer.add_char buf '=';
366      Buffer.add_char buf opts.quote_char;
367      Buffer.add_char buf opts.quote_char
368    end else if can_unquote_attr_value value then begin
369      Buffer.add_char buf '=';
370      Buffer.add_string buf value
371    end else begin
372      let quote = choose_quote value opts.quote_char opts.quote_char_explicit in
373      Buffer.add_char buf '=';
374      Buffer.add_char buf quote;
375      Buffer.add_string buf (escape_attr_value value quote opts.escape_lt_in_attrs);
376      Buffer.add_char buf quote
377    end
378  ) sorted_attrs;
379  if opts.use_trailing_solidus && (is_empty_tag || is_void_element name) then
380    Buffer.add_string buf " /";
381  Buffer.add_char buf '>';
382  Buffer.contents buf
383
384(* Check if text starts with ASCII whitespace *)
385let text_starts_with_space text =
386  String.length text > 0 &&
387  let c = text.[0] in
388  c = '\t' || c = '\n' || c = '\x0C' || c = '\r' || c = ' '
389
390(* Optional tag omission helpers *)
391type next_token =
392  | NTComment
393  | NTSpace  (* Text starting with space *)
394  | NTText   (* Text not starting with space *)
395  | NTStartTag of string
396  | NTEmptyTag of string
397  | NTEndTag of string
398  | NTEOF
399
400let classify_next tokens idx =
401  if idx >= Array.length tokens then NTEOF
402  else match tokens.(idx).token with
403    | None -> NTEOF
404    | Some (CommentNode _) -> NTComment
405    | Some (TextNode text) ->
406      if text_starts_with_space text then NTSpace else NTText
407    | Some (StartTag (name, _)) -> NTStartTag (String.lowercase_ascii name)
408    | Some (EmptyTag (name, _)) -> NTEmptyTag (String.lowercase_ascii name)
409    | Some (EndTag name) -> NTEndTag (String.lowercase_ascii name)
410    | Some (DoctypeNode _) -> NTEOF  (* Treat doctype as if nothing follows *)
411
412(* Should we omit a start tag? *)
413let should_omit_start_tag opts name attrs next =
414  if not opts.omit_optional_tags then false
415  else
416    let name = String.lowercase_ascii name in
417    match name, next with
418    (* html start tag: omit if not followed by comment or space, AND has no attributes *)
419    | "html", NTComment -> false
420    | "html", NTSpace -> false
421    | "html", _ -> attrs = []  (* only omit if no attributes *)
422    (* head start tag: omit if followed by element (start/empty tag) *)
423    | "head", NTStartTag _ -> true
424    | "head", NTEmptyTag _ -> true
425    | "head", NTEndTag "head" -> true  (* empty head *)
426    | "head", NTEOF -> true
427    | "head", _ -> false
428    (* body start tag: omit if not followed by comment or space, AND has no attributes *)
429    | "body", NTComment -> false
430    | "body", NTSpace -> false
431    | "body", _ -> attrs = []  (* only omit if no attributes *)
432    (* colgroup start tag: omit if followed by col element *)
433    | "colgroup", NTStartTag "col" -> true
434    | "colgroup", NTEmptyTag "col" -> true
435    | "colgroup", _ -> false
436    (* tbody start tag: omit if first child is tr *)
437    | "tbody", NTStartTag "tr" -> true
438    | "tbody", _ -> false
439    | _ -> false
440
441(* Should we omit an end tag? *)
442let should_omit_end_tag opts name next =
443  if not opts.omit_optional_tags then false
444  else
445    let name = String.lowercase_ascii name in
446    match name, next with
447    (* html end tag: omit if not followed by comment or space *)
448    | "html", NTComment -> false
449    | "html", NTSpace -> false
450    | "html", _ -> true
451    (* head end tag: omit if not followed by comment or space *)
452    | "head", NTComment -> false
453    | "head", NTSpace -> false
454    | "head", _ -> true
455    (* body end tag: omit if not followed by comment or space *)
456    | "body", NTComment -> false
457    | "body", NTSpace -> false
458    | "body", _ -> true
459    (* li end tag: omit if followed by li start tag or parent end tag *)
460    | "li", NTStartTag "li" -> true
461    | "li", NTEndTag _ -> true
462    | "li", NTEOF -> true
463    | "li", _ -> false
464    (* dt end tag: omit if followed by dt or dd start tag (NOT end tag or EOF!) *)
465    | "dt", NTStartTag "dt" -> true
466    | "dt", NTStartTag "dd" -> true
467    | "dt", _ -> false
468    (* dd end tag: omit if followed by dd or dt start tag, or end tag, or EOF *)
469    | "dd", NTStartTag "dd" -> true
470    | "dd", NTStartTag "dt" -> true
471    | "dd", NTEndTag _ -> true
472    | "dd", NTEOF -> true
473    | "dd", _ -> false
474    (* p end tag: omit if followed by block element (start or empty tag), end tag, or EOF *)
475    | "p", NTStartTag next_name when is_p_closing_element next_name -> true
476    | "p", NTEmptyTag next_name when is_p_closing_element next_name -> true
477    | "p", NTEndTag _ -> true
478    | "p", NTEOF -> true
479    | "p", _ -> false
480    (* optgroup end tag: omit if followed by optgroup start tag, end tag, or EOF *)
481    | "optgroup", NTStartTag "optgroup" -> true
482    | "optgroup", NTEndTag _ -> true
483    | "optgroup", NTEOF -> true
484    | "optgroup", _ -> false
485    (* option end tag: omit if followed by option/optgroup start tag, end tag, or EOF *)
486    | "option", NTStartTag "option" -> true
487    | "option", NTStartTag "optgroup" -> true
488    | "option", NTEndTag _ -> true
489    | "option", NTEOF -> true
490    | "option", _ -> false
491    (* colgroup end tag: omit if not followed by comment, space, or another colgroup *)
492    | "colgroup", NTComment -> false
493    | "colgroup", NTSpace -> false
494    | "colgroup", NTStartTag "colgroup" -> false  (* keep end tag when another colgroup follows *)
495    | "colgroup", _ -> true
496    (* thead end tag: omit if followed by tbody or tfoot start tag *)
497    | "thead", NTStartTag "tbody" -> true
498    | "thead", NTStartTag "tfoot" -> true
499    | "thead", _ -> false
500    (* tbody end tag: omit if followed by tbody/tfoot start tag, end tag, or EOF *)
501    | "tbody", NTStartTag "tbody" -> true
502    | "tbody", NTStartTag "tfoot" -> true
503    | "tbody", NTEndTag _ -> true
504    | "tbody", NTEOF -> true
505    | "tbody", _ -> false
506    (* tfoot end tag: omit if followed by tbody start tag, end tag, or EOF *)
507    | "tfoot", NTStartTag "tbody" -> true
508    | "tfoot", NTEndTag _ -> true
509    | "tfoot", NTEOF -> true
510    | "tfoot", _ -> false
511    (* tr end tag: omit if followed by tr start tag, end tag, or EOF *)
512    | "tr", NTStartTag "tr" -> true
513    | "tr", NTEndTag _ -> true
514    | "tr", NTEOF -> true
515    | "tr", _ -> false
516    (* td end tag: omit if followed by td/th start tag, end tag, or EOF *)
517    | "td", NTStartTag "td" -> true
518    | "td", NTStartTag "th" -> true
519    | "td", NTEndTag _ -> true
520    | "td", NTEOF -> true
521    | "td", _ -> false
522    (* th end tag: omit if followed by th/td start tag, end tag, or EOF *)
523    | "th", NTStartTag "th" -> true
524    | "th", NTStartTag "td" -> true
525    | "th", NTEndTag _ -> true
526    | "th", NTEOF -> true
527    | "th", _ -> false
528    | _ -> false
529
530(* Run a single test *)
531let run_test test =
532  try
533    (* Build token infos from input *)
534    let token_infos = Array.of_list (List.map build_token_info test.input) in
535    let num_tokens = Array.length token_infos in
536
537    (* Handle inject_meta_charset option *)
538    let inject_meta = test.options.inject_meta_charset in
539    let encoding = test.options.encoding in
540
541    (* Serialize with context tracking *)
542    let buf = Buffer.create 256 in
543    let in_raw_text = ref false in
544    let preserve_whitespace = ref false in
545    let element_stack : string list ref = ref [] in
546    let in_head = ref false in
547    let meta_charset_injected = ref false in
548    let prev_was_section_end = ref false in  (* Track if prev token was thead/tbody/tfoot end *)
549
550    for i = 0 to num_tokens - 1 do
551      let info = token_infos.(i) in
552      let next = classify_next token_infos (i + 1) in
553
554      match info.token with
555      | None -> ()
556
557      | Some (StartTag (name, attrs)) ->
558        let name_lower = String.lowercase_ascii name in
559
560        (* Track head element *)
561        if name_lower = "head" then in_head := true;
562
563        (* For inject_meta_charset, we need to check if there's any charset meta coming up *)
564        (* If yes, don't inject at head start; if no, inject at head start *)
565        let should_inject_at_head =
566          if not inject_meta || name_lower <> "head" then false
567          else match encoding with
568            | None -> false
569            | Some _ ->
570              (* Look ahead to see if there's a charset meta or http-equiv content-type *)
571              let has_charset_meta = ref false in
572              for j = i + 1 to num_tokens - 1 do
573                match token_infos.(j).token with
574                | Some (EmptyTag (n, a)) when String.lowercase_ascii n = "meta" ->
575                  let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") a in
576                  let has_http_equiv_ct = List.exists (fun (k, v) ->
577                    String.lowercase_ascii k = "http-equiv" && String.lowercase_ascii v = "content-type") a in
578                  if has_charset || has_http_equiv_ct then has_charset_meta := true
579                | Some (EndTag n) when String.lowercase_ascii n = "head" -> ()
580                | _ -> ()
581              done;
582              not !has_charset_meta
583        in
584
585        (* Special case: tbody start tag cannot be omitted if preceded by section end tag *)
586        let can_omit_start =
587          if name_lower = "tbody" && !prev_was_section_end then false
588          else should_omit_start_tag test.options name attrs next
589        in
590        prev_was_section_end := false;  (* Reset for next iteration *)
591
592        if should_inject_at_head then begin
593          meta_charset_injected := true;
594          (* Don't output head start tag if we should omit it *)
595          if not can_omit_start then
596            Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs);
597          Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" (Option.get encoding));
598          element_stack := name :: !element_stack;
599          if is_raw_text_element name then in_raw_text := true;
600          if is_whitespace_preserving_element name then preserve_whitespace := true
601        end else if not can_omit_start then begin
602          Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs);
603          element_stack := name :: !element_stack;
604          if is_raw_text_element name then in_raw_text := true;
605          if is_whitespace_preserving_element name then preserve_whitespace := true
606        end else begin
607          element_stack := name :: !element_stack;
608          if is_raw_text_element name then in_raw_text := true;
609          if is_whitespace_preserving_element name then preserve_whitespace := true
610        end
611
612      | Some (EmptyTag (name, attrs)) ->
613        let name_lower = String.lowercase_ascii name in
614        prev_was_section_end := false;  (* Reset for next iteration *)
615
616        (* Handle meta charset replacement *)
617        if inject_meta && !in_head && name_lower = "meta" then begin
618          let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") attrs in
619          let has_http_equiv_ct =
620            List.exists (fun (k, v) ->
621              String.lowercase_ascii k = "http-equiv" &&
622              String.lowercase_ascii v = "content-type"
623            ) attrs
624          in
625          if has_charset then begin
626            (* Replace charset value *)
627            match encoding with
628            | Some enc ->
629              Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" enc)
630            | None -> ()  (* No encoding, skip the meta tag *)
631          end else if has_http_equiv_ct then begin
632            (* Replace charset in content value *)
633            match encoding with
634            | Some enc ->
635              let new_attrs = List.map (fun (k, v) ->
636                if String.lowercase_ascii k = "content" then
637                  let new_content = Printf.sprintf "text/html; charset=%s" enc in
638                  (k, new_content)
639                else (k, v)
640              ) attrs in
641              Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name new_attrs)
642            | None ->
643              Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
644          end else begin
645            (* Regular meta tag, output as normal *)
646            Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
647          end
648        end else
649          Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
650
651      | Some (EndTag name) ->
652        let name_lower = String.lowercase_ascii name in
653
654        (* Track head element *)
655        if name_lower = "head" then in_head := false;
656
657        (* Pop from element stack *)
658        (match !element_stack with
659         | top :: rest when String.lowercase_ascii top = name_lower ->
660           element_stack := rest;
661           if is_raw_text_element name then in_raw_text := false;
662           if is_whitespace_preserving_element name then preserve_whitespace := false
663         | _ -> ());
664
665        let is_section_end = List.mem name_lower ["thead"; "tbody"; "tfoot"] in
666        let omit = should_omit_end_tag test.options name next in
667
668        if not omit then begin
669          Buffer.add_string buf "</";
670          Buffer.add_string buf name;
671          Buffer.add_char buf '>'
672        end;
673
674        (* Track if we omitted a section end tag - next tbody can't be omitted *)
675        prev_was_section_end := is_section_end && omit
676
677      | Some (TextNode text) ->
678        prev_was_section_end := false;
679        let processed_text =
680          if !in_raw_text && not test.options.escape_rcdata then
681            text
682          else if test.options.strip_whitespace && not !preserve_whitespace then
683            escape_text (collapse_whitespace text)
684          else
685            escape_text text
686        in
687        Buffer.add_string buf processed_text
688
689      | Some (CommentNode text) ->
690        prev_was_section_end := false;
691        Buffer.add_string buf "<!--";
692        Buffer.add_string buf text;
693        Buffer.add_string buf "-->"
694
695      | Some (DoctypeNode node) ->
696        prev_was_section_end := false;
697        Buffer.add_string buf (serialize_node test.options ~in_raw_text:false node)
698    done;
699
700    let serialized = Buffer.contents buf in
701
702    (* Check if it matches any expected output *)
703    let matches = List.exists (fun exp -> serialized = exp) test.expected in
704
705    (matches, serialized, test.expected)
706  with e ->
707    (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected)
708
709(* Run all tests in a file *)
710let format_options opts =
711  let parts = [] in
712  let parts = if opts.quote_char_explicit then
713    Printf.sprintf "quote_char='%c'" opts.quote_char :: parts else parts in
714  let parts = if not opts.minimize_boolean_attributes then
715    "minimize_bool=false" :: parts else parts in
716  let parts = if opts.use_trailing_solidus then
717    "trailing_solidus=true" :: parts else parts in
718  let parts = if opts.escape_lt_in_attrs then
719    "escape_lt=true" :: parts else parts in
720  let parts = if opts.escape_rcdata then
721    "escape_rcdata=true" :: parts else parts in
722  let parts = if opts.strip_whitespace then
723    "strip_ws=true" :: parts else parts in
724  let parts = if opts.inject_meta_charset then
725    "inject_charset=true" :: parts else parts in
726  let parts = if not opts.omit_optional_tags then
727    "omit_tags=false" :: parts else parts in
728  if parts = [] then "(defaults)" else String.concat ", " (List.rev parts)
729
730let run_file path =
731  let content =
732    let ic = open_in path in
733    let n = in_channel_length ic in
734    let s = really_input_string ic n in
735    close_in ic;
736    s
737  in
738
739  let json = match Jsont_bytesrw.decode_string Jsont.json content with
740    | Ok j -> j
741    | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e)
742  in
743
744  let obj = json_object json in
745  let tests_json = match json_mem "tests" obj with
746    | Some t -> json_array t
747    | None -> []
748  in
749
750  let filename = Filename.basename path in
751  let passed = ref 0 in
752  let failed = ref 0 in
753  let results = ref [] in
754
755  List.iteri (fun i test_json ->
756    try
757      let test = parse_test_case test_json in
758      let (success, actual, expected) = run_test test in
759
760      let result : Report.test_result = {
761        test_num = i + 1;
762        description = test.description;
763        input = String.concat "\n" (List.map (fun tok ->
764          (* Simplified token representation *)
765          match tok with
766          | Jsont.Array (arr, _) ->
767            (match arr with
768             | Jsont.String (ty, _) :: rest ->
769               Printf.sprintf "%s: %s" ty (String.concat ", " (List.map (function
770                 | Jsont.String (s, _) -> Printf.sprintf "%S" s
771                 | Jsont.Object _ -> "{...}"
772                 | Jsont.Null _ -> "null"
773                 | _ -> "?"
774               ) rest))
775             | _ -> "?")
776          | _ -> "?"
777        ) test.input);
778        expected = String.concat " | " expected;
779        actual;
780        success;
781        details = [
782          ("Options", format_options test.options);
783          ("Expected Variants", string_of_int (List.length expected));
784        ];
785        raw_test_data = Some test.raw_json;
786      } in
787      results := result :: !results;
788
789      if success then incr passed else incr failed
790    with e ->
791      incr failed;
792      let result : Report.test_result = {
793        test_num = i + 1;
794        description = Printf.sprintf "Test %d" (i + 1);
795        input = "";
796        expected = "";
797        actual = Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e);
798        success = false;
799        details = [];
800        raw_test_data = Some (json_to_string test_json);
801      } in
802      results := result :: !results;
803      Printf.eprintf "Exception parsing test %d: %s\n" (i + 1) (Printexc.to_string e)
804  ) tests_json;
805
806  let file_result : Report.file_result = {
807    filename;
808    test_type = "Serializer";
809    passed_count = !passed;
810    failed_count = !failed;
811    tests = List.rev !results;
812  } in
813  (file_result, !passed, !failed)
814
815let () =
816  let test_dir = Sys.argv.(1) in
817  let files = Sys.readdir test_dir |> Array.to_list in
818  let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in
819
820  let total_passed = ref 0 in
821  let total_failed = ref 0 in
822  let file_results = ref [] in
823
824  List.iter (fun file ->
825    let path = Filename.concat test_dir file in
826    let (file_result, passed, failed) = run_file path in
827    total_passed := !total_passed + passed;
828    total_failed := !total_failed + failed;
829    file_results := file_result :: !file_results;
830    Printf.printf "%s: %d passed, %d failed\n" file passed failed
831  ) (List.sort String.compare test_files);
832
833  Printf.printf "\n=== Summary ===\n";
834  Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;
835
836  (* Generate HTML report *)
837  let report : Report.report = {
838    title = "HTML5 Serializer Tests";
839    test_type = "serializer";
840    description = "These tests validate the HTML serialization algorithm for converting DOM trees back to HTML text. \
841                   Each test provides a sequence of tokens (start tags, end tags, text, comments, doctypes) and one \
842                   or more valid serialized outputs. Tests cover attribute quoting, boolean attribute minimization, \
843                   self-closing tag syntax (trailing solidus), entity escaping, whitespace handling, meta charset \
844                   injection, and optional tag omission rules as specified in the HTML Standard. Multiple expected \
845                   outputs allow for valid variations in serialization style.";
846    files = List.rev !file_results;
847    total_passed = !total_passed;
848    total_failed = !total_failed;
849  } in
850  Report.generate_report report "test_serializer_report.html";
851
852  exit (if !total_failed > 0 then 1 else 0)