OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Test runner for html5lib-tests serializer tests *) 2 3module Dom = Html5rw.Dom 4module Report = Test_report 5 6(* Extract values from JSON *) 7let json_string = function 8 | Jsont.String (s, _) -> s 9 | _ -> failwith "Expected string" 10 11let json_string_opt = function 12 | Jsont.Null _ -> None 13 | Jsont.String (s, _) -> Some s 14 | _ -> failwith "Expected string or null" 15 16let json_bool = function 17 | Jsont.Bool (b, _) -> b 18 | _ -> failwith "Expected bool" 19 20let json_array = function 21 | Jsont.Array (arr, _) -> arr 22 | _ -> failwith "Expected array" 23 24let json_object = function 25 | Jsont.Object (obj, _) -> obj 26 | _ -> failwith "Expected object" 27 28let json_mem name obj = 29 match List.find_opt (fun ((n, _), _) -> n = name) obj with 30 | Some (_, v) -> Some v 31 | None -> None 32 33let json_mem_exn name obj = 34 match json_mem name obj with 35 | Some v -> v 36 | None -> failwith ("Missing member: " ^ name) 37 38let rec json_to_string = function 39 | Jsont.Null _ -> "null" 40 | Jsont.Bool (b, _) -> string_of_bool b 41 | Jsont.Number (n, _) -> Printf.sprintf "%g" n 42 | Jsont.String (s, _) -> Printf.sprintf "%S" s 43 | Jsont.Array (arr, _) -> 44 "[" ^ String.concat ", " (List.map json_to_string arr) ^ "]" 45 | Jsont.Object (obj, _) -> 46 "{" ^ String.concat ", " (List.map (fun ((n, _), v) -> Printf.sprintf "%S: %s" n (json_to_string v)) obj) ^ "}" 47 48(* Serialization options *) 49type serialize_options = { 50 quote_char : char; 51 quote_char_explicit : bool; (* Was quote_char explicitly set? *) 52 minimize_boolean_attributes : bool; 53 use_trailing_solidus : bool; 54 escape_lt_in_attrs : bool; 55 escape_rcdata : bool; 56 strip_whitespace : bool; 57 inject_meta_charset : bool; 58 encoding : string option; 59 omit_optional_tags : bool; 60} 61 62let default_options = { 63 quote_char = '"'; 64 quote_char_explicit = false; 65 minimize_boolean_attributes = true; 66 use_trailing_solidus = false; 67 escape_lt_in_attrs = false; 68 escape_rcdata = false; 69 strip_whitespace = false; 70 inject_meta_charset = false; 71 encoding = None; 72 omit_optional_tags = true; (* HTML5 default *) 73} 74 75(* Parse options from JSON *) 76let parse_options json_opt = 77 match json_opt with 78 | None -> default_options 79 | Some json -> 80 let obj = json_object json in 81 let get_bool name default = 82 match json_mem name obj with 83 | Some j -> (try json_bool j with _ -> default) 84 | None -> default 85 in 86 let get_string name = 87 match json_mem name obj with 88 | Some (Jsont.String (s, _)) -> Some s 89 | _ -> None 90 in 91 let quote_char_opt = 92 match json_mem "quote_char" obj with 93 | Some (Jsont.String (s, _)) when String.length s = 1 -> Some s.[0] 94 | _ -> None 95 in 96 { 97 quote_char = Option.value ~default:'"' quote_char_opt; 98 quote_char_explicit = Option.is_some quote_char_opt; 99 minimize_boolean_attributes = get_bool "minimize_boolean_attributes" (get_bool "quote_attr_values" true); 100 use_trailing_solidus = get_bool "use_trailing_solidus" false; 101 escape_lt_in_attrs = get_bool "escape_lt_in_attrs" false; 102 escape_rcdata = get_bool "escape_rcdata" false; 103 strip_whitespace = get_bool "strip_whitespace" false; 104 inject_meta_charset = get_bool "inject_meta_charset" false; 105 encoding = get_string "encoding"; 106 omit_optional_tags = get_bool "omit_optional_tags" true; 107 } 108 109(* Test case *) 110type test_case = { 111 description : string; 112 input : Jsont.json list; 113 expected : string list; 114 options : serialize_options; 115 raw_json : string; (* Original JSON representation of this test *) 116} 117 118let parse_test_case json = 119 let raw_json = json_to_string json in 120 let obj = json_object json in 121 let description = json_string (json_mem_exn "description" obj) in 122 let input = json_array (json_mem_exn "input" obj) in 123 let expected = List.map json_string (json_array (json_mem_exn "expected" obj)) in 124 let options = parse_options (json_mem "options" obj) in 125 { description; input; expected; options; raw_json } 126 127(* Parse attrs that can be either array [{name, value}] or object {name: value} or empty {} *) 128let parse_attrs attrs_json = 129 match attrs_json with 130 | Jsont.Array (arr, _) -> 131 List.map (fun attr_json -> 132 let attr_obj = json_object attr_json in 133 let attr_name = json_string (json_mem_exn "name" attr_obj) in 134 let value = json_string (json_mem_exn "value" attr_obj) in 135 (attr_name, value) 136 ) arr 137 | Jsont.Object (obj, _) -> 138 List.map (fun ((n, _), v) -> (n, json_string v)) obj 139 | _ -> [] 140 141(* Void elements that don't need end tags *) 142let is_void_element name = 143 List.mem (String.lowercase_ascii name) 144 ["area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input"; 145 "link"; "meta"; "param"; "source"; "track"; "wbr"] 146 147(* Raw text elements whose content should not be escaped *) 148let is_raw_text_element name = 149 List.mem (String.lowercase_ascii name) ["script"; "style"] 150 151(* Elements where whitespace should be preserved *) 152let is_whitespace_preserving_element name = 153 List.mem (String.lowercase_ascii name) ["pre"; "textarea"; "script"; "style"] 154 155(* Block elements that close a p tag *) 156let p_closing_elements = [ 157 "address"; "article"; "aside"; "blockquote"; "datagrid"; "dialog"; "dir"; 158 "div"; "dl"; "fieldset"; "footer"; "form"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; 159 "header"; "hgroup"; "hr"; "main"; "menu"; "nav"; "ol"; "p"; "pre"; "section"; 160 "table"; "ul" 161] 162 163let is_p_closing_element name = 164 List.mem (String.lowercase_ascii name) p_closing_elements 165 166(* Collapse runs of whitespace to single space *) 167let collapse_whitespace text = 168 let len = String.length text in 169 let buf = Buffer.create len in 170 let in_whitespace = ref false in 171 for i = 0 to len - 1 do 172 let c = text.[i] in 173 if c = '\t' || c = '\r' || c = '\n' || c = '\x0C' || c = ' ' then begin 174 if not !in_whitespace then begin 175 Buffer.add_char buf ' '; 176 in_whitespace := true 177 end 178 end else begin 179 Buffer.add_char buf c; 180 in_whitespace := false 181 end 182 done; 183 Buffer.contents buf 184 185(* Token types for stream-based serialization *) 186type token_type = 187 | StartTag of string * (string * string) list (* name, attrs *) 188 | EndTag of string (* name *) 189 | EmptyTag of string * (string * string) list (* name, attrs *) 190 | TextNode of string 191 | CommentNode of string 192 | DoctypeNode of Html5rw.Dom.node 193 194type token_info = { 195 token : token_type option; 196} 197 198let build_token_info token = 199 let arr = json_array token in 200 match arr with 201 | [] -> { token = None } 202 | type_json :: rest -> 203 let token_type_str = json_string type_json in 204 match token_type_str, rest with 205 | "StartTag", [_ns_json; name_json; attrs_json] -> 206 let name = json_string name_json in 207 let attrs = parse_attrs attrs_json in 208 { token = Some (StartTag (name, attrs)) } 209 210 | "StartTag", [name_json; attrs_json] -> 211 let name = json_string name_json in 212 let attrs = parse_attrs attrs_json in 213 { token = Some (StartTag (name, attrs)) } 214 215 | "EmptyTag", [name_json; attrs_json] -> 216 let name = json_string name_json in 217 let attrs = parse_attrs attrs_json in 218 { token = Some (EmptyTag (name, attrs)) } 219 220 | "EndTag", [_ns_json; name_json] -> 221 let name = json_string name_json in 222 { token = Some (EndTag name) } 223 224 | "EndTag", [name_json] -> 225 let name = json_string name_json in 226 { token = Some (EndTag name) } 227 228 | "Characters", [text_json] -> 229 let text = json_string text_json in 230 { token = Some (TextNode text) } 231 232 | "Comment", [text_json] -> 233 let text = json_string text_json in 234 { token = Some (CommentNode text) } 235 236 | "Doctype", [name_json] -> 237 let name = json_string name_json in 238 let node = Html5rw.Dom.create_doctype ~name () in 239 { token = Some (DoctypeNode node) } 240 241 | "Doctype", [name_json; public_json] -> 242 let name = json_string name_json in 243 let public_id = json_string_opt public_json in 244 let node = match public_id with 245 | Some pub -> Html5rw.Dom.create_doctype ~name ~public_id:pub () 246 | None -> Html5rw.Dom.create_doctype ~name () 247 in 248 { token = Some (DoctypeNode node) } 249 250 | "Doctype", [name_json; public_json; system_json] -> 251 let name = json_string name_json in 252 let public_id = json_string_opt public_json in 253 let system_id = json_string_opt system_json in 254 let node = match public_id, system_id with 255 | Some pub, Some sys -> Html5rw.Dom.create_doctype ~name ~public_id:pub ~system_id:sys () 256 | Some pub, None -> Html5rw.Dom.create_doctype ~name ~public_id:pub () 257 | None, Some sys -> Html5rw.Dom.create_doctype ~name ~system_id:sys () 258 | None, None -> Html5rw.Dom.create_doctype ~name () 259 in 260 { token = Some (DoctypeNode node) } 261 262 | _ -> { token = None } 263 264(* Serialize a single node to HTML with options *) 265let escape_text text = 266 let buf = Buffer.create (String.length text) in 267 String.iter (fun c -> 268 match c with 269 | '&' -> Buffer.add_string buf "&amp;" 270 | '<' -> Buffer.add_string buf "&lt;" 271 | '>' -> Buffer.add_string buf "&gt;" 272 | c -> Buffer.add_char buf c 273 ) text; 274 Buffer.contents buf 275 276let can_unquote_attr_value value = 277 if String.length value = 0 then false 278 else 279 let valid = ref true in 280 String.iter (fun c -> 281 if c = '>' || c = '"' || c = '\'' || c = '=' || c = '`' || 282 c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' then 283 valid := false 284 ) value; 285 !valid 286 287let escape_attr_value value quote_char escape_lt = 288 let buf = Buffer.create (String.length value) in 289 String.iter (fun c -> 290 match c with 291 | '&' -> Buffer.add_string buf "&amp;" 292 | '"' when quote_char = '"' -> Buffer.add_string buf "&quot;" 293 | '\'' when quote_char = '\'' -> Buffer.add_string buf "&#39;" 294 | '<' when escape_lt -> Buffer.add_string buf "&lt;" 295 | c -> Buffer.add_char buf c 296 ) value; 297 Buffer.contents buf 298 299let serialize_node opts ~in_raw_text node = 300 match node.Dom.name with 301 | "#text" -> 302 if in_raw_text && not opts.escape_rcdata then 303 node.Dom.data 304 else 305 escape_text node.Dom.data 306 | "#comment" -> 307 "<!--" ^ node.Dom.data ^ "-->" 308 | "!doctype" -> 309 let buf = Buffer.create 64 in 310 Buffer.add_string buf "<!DOCTYPE "; 311 (match node.Dom.doctype with 312 | Some dt -> 313 Buffer.add_string buf (Option.value ~default:"html" dt.Dom.name); 314 (match dt.Dom.public_id with 315 | Some pub when pub <> "" -> 316 Buffer.add_string buf " PUBLIC \""; 317 Buffer.add_string buf pub; 318 Buffer.add_char buf '"'; 319 (match dt.Dom.system_id with 320 | Some sys -> 321 Buffer.add_string buf " \""; 322 Buffer.add_string buf sys; 323 Buffer.add_char buf '"' 324 | None -> ()) 325 | _ -> 326 match dt.Dom.system_id with 327 | Some sys when sys <> "" -> 328 Buffer.add_string buf " SYSTEM \""; 329 Buffer.add_string buf sys; 330 Buffer.add_char buf '"' 331 | _ -> ()) 332 | None -> Buffer.add_string buf "html"); 333 Buffer.add_char buf '>'; 334 Buffer.contents buf 335 | _ -> failwith "serialize_node called with element" 336 337let choose_quote value default_quote explicit = 338 (* If quote_char was explicitly set, always use it *) 339 if explicit then default_quote 340 else 341 (* Otherwise, if value contains the default quote but not the other, use the other *) 342 let has_double = String.contains value '"' in 343 let has_single = String.contains value '\'' in 344 if has_double && not has_single then '\'' 345 else if has_single && not has_double then '"' 346 else default_quote 347 348(* Serialize an element tag (start tag) *) 349let serialize_start_tag opts ~is_empty_tag name attrs = 350 let buf = Buffer.create 64 in 351 Buffer.add_char buf '<'; 352 Buffer.add_string buf name; 353 (* Sort attributes alphabetically for consistent output *) 354 let sorted_attrs = List.sort (fun (a, _) (b, _) -> String.compare a b) attrs in 355 List.iter (fun (key, value) -> 356 Buffer.add_char buf ' '; 357 Buffer.add_string buf key; 358 let should_minimize = 359 opts.minimize_boolean_attributes && 360 String.lowercase_ascii key = String.lowercase_ascii value 361 in 362 if should_minimize then 363 () 364 else if String.length value = 0 then begin 365 Buffer.add_char buf '='; 366 Buffer.add_char buf opts.quote_char; 367 Buffer.add_char buf opts.quote_char 368 end else if can_unquote_attr_value value then begin 369 Buffer.add_char buf '='; 370 Buffer.add_string buf value 371 end else begin 372 let quote = choose_quote value opts.quote_char opts.quote_char_explicit in 373 Buffer.add_char buf '='; 374 Buffer.add_char buf quote; 375 Buffer.add_string buf (escape_attr_value value quote opts.escape_lt_in_attrs); 376 Buffer.add_char buf quote 377 end 378 ) sorted_attrs; 379 if opts.use_trailing_solidus && (is_empty_tag || is_void_element name) then 380 Buffer.add_string buf " /"; 381 Buffer.add_char buf '>'; 382 Buffer.contents buf 383 384(* Check if text starts with ASCII whitespace *) 385let text_starts_with_space text = 386 String.length text > 0 && 387 let c = text.[0] in 388 c = '\t' || c = '\n' || c = '\x0C' || c = '\r' || c = ' ' 389 390(* Optional tag omission helpers *) 391type next_token = 392 | NTComment 393 | NTSpace (* Text starting with space *) 394 | NTText (* Text not starting with space *) 395 | NTStartTag of string 396 | NTEmptyTag of string 397 | NTEndTag of string 398 | NTEOF 399 400let classify_next tokens idx = 401 if idx >= Array.length tokens then NTEOF 402 else match tokens.(idx).token with 403 | None -> NTEOF 404 | Some (CommentNode _) -> NTComment 405 | Some (TextNode text) -> 406 if text_starts_with_space text then NTSpace else NTText 407 | Some (StartTag (name, _)) -> NTStartTag (String.lowercase_ascii name) 408 | Some (EmptyTag (name, _)) -> NTEmptyTag (String.lowercase_ascii name) 409 | Some (EndTag name) -> NTEndTag (String.lowercase_ascii name) 410 | Some (DoctypeNode _) -> NTEOF (* Treat doctype as if nothing follows *) 411 412(* Should we omit a start tag? *) 413let should_omit_start_tag opts name attrs next = 414 if not opts.omit_optional_tags then false 415 else 416 let name = String.lowercase_ascii name in 417 match name, next with 418 (* html start tag: omit if not followed by comment or space, AND has no attributes *) 419 | "html", NTComment -> false 420 | "html", NTSpace -> false 421 | "html", _ -> attrs = [] (* only omit if no attributes *) 422 (* head start tag: omit if followed by element (start/empty tag) *) 423 | "head", NTStartTag _ -> true 424 | "head", NTEmptyTag _ -> true 425 | "head", NTEndTag "head" -> true (* empty head *) 426 | "head", NTEOF -> true 427 | "head", _ -> false 428 (* body start tag: omit if not followed by comment or space, AND has no attributes *) 429 | "body", NTComment -> false 430 | "body", NTSpace -> false 431 | "body", _ -> attrs = [] (* only omit if no attributes *) 432 (* colgroup start tag: omit if followed by col element *) 433 | "colgroup", NTStartTag "col" -> true 434 | "colgroup", NTEmptyTag "col" -> true 435 | "colgroup", _ -> false 436 (* tbody start tag: omit if first child is tr *) 437 | "tbody", NTStartTag "tr" -> true 438 | "tbody", _ -> false 439 | _ -> false 440 441(* Should we omit an end tag? *) 442let should_omit_end_tag opts name next = 443 if not opts.omit_optional_tags then false 444 else 445 let name = String.lowercase_ascii name in 446 match name, next with 447 (* html end tag: omit if not followed by comment or space *) 448 | "html", NTComment -> false 449 | "html", NTSpace -> false 450 | "html", _ -> true 451 (* head end tag: omit if not followed by comment or space *) 452 | "head", NTComment -> false 453 | "head", NTSpace -> false 454 | "head", _ -> true 455 (* body end tag: omit if not followed by comment or space *) 456 | "body", NTComment -> false 457 | "body", NTSpace -> false 458 | "body", _ -> true 459 (* li end tag: omit if followed by li start tag or parent end tag *) 460 | "li", NTStartTag "li" -> true 461 | "li", NTEndTag _ -> true 462 | "li", NTEOF -> true 463 | "li", _ -> false 464 (* dt end tag: omit if followed by dt or dd start tag (NOT end tag or EOF!) *) 465 | "dt", NTStartTag "dt" -> true 466 | "dt", NTStartTag "dd" -> true 467 | "dt", _ -> false 468 (* dd end tag: omit if followed by dd or dt start tag, or end tag, or EOF *) 469 | "dd", NTStartTag "dd" -> true 470 | "dd", NTStartTag "dt" -> true 471 | "dd", NTEndTag _ -> true 472 | "dd", NTEOF -> true 473 | "dd", _ -> false 474 (* p end tag: omit if followed by block element (start or empty tag), end tag, or EOF *) 475 | "p", NTStartTag next_name when is_p_closing_element next_name -> true 476 | "p", NTEmptyTag next_name when is_p_closing_element next_name -> true 477 | "p", NTEndTag _ -> true 478 | "p", NTEOF -> true 479 | "p", _ -> false 480 (* optgroup end tag: omit if followed by optgroup start tag, end tag, or EOF *) 481 | "optgroup", NTStartTag "optgroup" -> true 482 | "optgroup", NTEndTag _ -> true 483 | "optgroup", NTEOF -> true 484 | "optgroup", _ -> false 485 (* option end tag: omit if followed by option/optgroup start tag, end tag, or EOF *) 486 | "option", NTStartTag "option" -> true 487 | "option", NTStartTag "optgroup" -> true 488 | "option", NTEndTag _ -> true 489 | "option", NTEOF -> true 490 | "option", _ -> false 491 (* colgroup end tag: omit if not followed by comment, space, or another colgroup *) 492 | "colgroup", NTComment -> false 493 | "colgroup", NTSpace -> false 494 | "colgroup", NTStartTag "colgroup" -> false (* keep end tag when another colgroup follows *) 495 | "colgroup", _ -> true 496 (* thead end tag: omit if followed by tbody or tfoot start tag *) 497 | "thead", NTStartTag "tbody" -> true 498 | "thead", NTStartTag "tfoot" -> true 499 | "thead", _ -> false 500 (* tbody end tag: omit if followed by tbody/tfoot start tag, end tag, or EOF *) 501 | "tbody", NTStartTag "tbody" -> true 502 | "tbody", NTStartTag "tfoot" -> true 503 | "tbody", NTEndTag _ -> true 504 | "tbody", NTEOF -> true 505 | "tbody", _ -> false 506 (* tfoot end tag: omit if followed by tbody start tag, end tag, or EOF *) 507 | "tfoot", NTStartTag "tbody" -> true 508 | "tfoot", NTEndTag _ -> true 509 | "tfoot", NTEOF -> true 510 | "tfoot", _ -> false 511 (* tr end tag: omit if followed by tr start tag, end tag, or EOF *) 512 | "tr", NTStartTag "tr" -> true 513 | "tr", NTEndTag _ -> true 514 | "tr", NTEOF -> true 515 | "tr", _ -> false 516 (* td end tag: omit if followed by td/th start tag, end tag, or EOF *) 517 | "td", NTStartTag "td" -> true 518 | "td", NTStartTag "th" -> true 519 | "td", NTEndTag _ -> true 520 | "td", NTEOF -> true 521 | "td", _ -> false 522 (* th end tag: omit if followed by th/td start tag, end tag, or EOF *) 523 | "th", NTStartTag "th" -> true 524 | "th", NTStartTag "td" -> true 525 | "th", NTEndTag _ -> true 526 | "th", NTEOF -> true 527 | "th", _ -> false 528 | _ -> false 529 530(* Run a single test *) 531let run_test test = 532 try 533 (* Build token infos from input *) 534 let token_infos = Array.of_list (List.map build_token_info test.input) in 535 let num_tokens = Array.length token_infos in 536 537 (* Handle inject_meta_charset option *) 538 let inject_meta = test.options.inject_meta_charset in 539 let encoding = test.options.encoding in 540 541 (* Serialize with context tracking *) 542 let buf = Buffer.create 256 in 543 let in_raw_text = ref false in 544 let preserve_whitespace = ref false in 545 let element_stack : string list ref = ref [] in 546 let in_head = ref false in 547 let meta_charset_injected = ref false in 548 let prev_was_section_end = ref false in (* Track if prev token was thead/tbody/tfoot end *) 549 550 for i = 0 to num_tokens - 1 do 551 let info = token_infos.(i) in 552 let next = classify_next token_infos (i + 1) in 553 554 match info.token with 555 | None -> () 556 557 | Some (StartTag (name, attrs)) -> 558 let name_lower = String.lowercase_ascii name in 559 560 (* Track head element *) 561 if name_lower = "head" then in_head := true; 562 563 (* For inject_meta_charset, we need to check if there's any charset meta coming up *) 564 (* If yes, don't inject at head start; if no, inject at head start *) 565 let should_inject_at_head = 566 if not inject_meta || name_lower <> "head" then false 567 else match encoding with 568 | None -> false 569 | Some _ -> 570 (* Look ahead to see if there's a charset meta or http-equiv content-type *) 571 let has_charset_meta = ref false in 572 for j = i + 1 to num_tokens - 1 do 573 match token_infos.(j).token with 574 | Some (EmptyTag (n, a)) when String.lowercase_ascii n = "meta" -> 575 let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") a in 576 let has_http_equiv_ct = List.exists (fun (k, v) -> 577 String.lowercase_ascii k = "http-equiv" && String.lowercase_ascii v = "content-type") a in 578 if has_charset || has_http_equiv_ct then has_charset_meta := true 579 | Some (EndTag n) when String.lowercase_ascii n = "head" -> () 580 | _ -> () 581 done; 582 not !has_charset_meta 583 in 584 585 (* Special case: tbody start tag cannot be omitted if preceded by section end tag *) 586 let can_omit_start = 587 if name_lower = "tbody" && !prev_was_section_end then false 588 else should_omit_start_tag test.options name attrs next 589 in 590 prev_was_section_end := false; (* Reset for next iteration *) 591 592 if should_inject_at_head then begin 593 meta_charset_injected := true; 594 (* Don't output head start tag if we should omit it *) 595 if not can_omit_start then 596 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs); 597 Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" (Option.get encoding)); 598 element_stack := name :: !element_stack; 599 if is_raw_text_element name then in_raw_text := true; 600 if is_whitespace_preserving_element name then preserve_whitespace := true 601 end else if not can_omit_start then begin 602 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs); 603 element_stack := name :: !element_stack; 604 if is_raw_text_element name then in_raw_text := true; 605 if is_whitespace_preserving_element name then preserve_whitespace := true 606 end else begin 607 element_stack := name :: !element_stack; 608 if is_raw_text_element name then in_raw_text := true; 609 if is_whitespace_preserving_element name then preserve_whitespace := true 610 end 611 612 | Some (EmptyTag (name, attrs)) -> 613 let name_lower = String.lowercase_ascii name in 614 prev_was_section_end := false; (* Reset for next iteration *) 615 616 (* Handle meta charset replacement *) 617 if inject_meta && !in_head && name_lower = "meta" then begin 618 let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") attrs in 619 let has_http_equiv_ct = 620 List.exists (fun (k, v) -> 621 String.lowercase_ascii k = "http-equiv" && 622 String.lowercase_ascii v = "content-type" 623 ) attrs 624 in 625 if has_charset then begin 626 (* Replace charset value *) 627 match encoding with 628 | Some enc -> 629 Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" enc) 630 | None -> () (* No encoding, skip the meta tag *) 631 end else if has_http_equiv_ct then begin 632 (* Replace charset in content value *) 633 match encoding with 634 | Some enc -> 635 let new_attrs = List.map (fun (k, v) -> 636 if String.lowercase_ascii k = "content" then 637 let new_content = Printf.sprintf "text/html; charset=%s" enc in 638 (k, new_content) 639 else (k, v) 640 ) attrs in 641 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name new_attrs) 642 | None -> 643 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs) 644 end else begin 645 (* Regular meta tag, output as normal *) 646 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs) 647 end 648 end else 649 Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs) 650 651 | Some (EndTag name) -> 652 let name_lower = String.lowercase_ascii name in 653 654 (* Track head element *) 655 if name_lower = "head" then in_head := false; 656 657 (* Pop from element stack *) 658 (match !element_stack with 659 | top :: rest when String.lowercase_ascii top = name_lower -> 660 element_stack := rest; 661 if is_raw_text_element name then in_raw_text := false; 662 if is_whitespace_preserving_element name then preserve_whitespace := false 663 | _ -> ()); 664 665 let is_section_end = List.mem name_lower ["thead"; "tbody"; "tfoot"] in 666 let omit = should_omit_end_tag test.options name next in 667 668 if not omit then begin 669 Buffer.add_string buf "</"; 670 Buffer.add_string buf name; 671 Buffer.add_char buf '>' 672 end; 673 674 (* Track if we omitted a section end tag - next tbody can't be omitted *) 675 prev_was_section_end := is_section_end && omit 676 677 | Some (TextNode text) -> 678 prev_was_section_end := false; 679 let processed_text = 680 if !in_raw_text && not test.options.escape_rcdata then 681 text 682 else if test.options.strip_whitespace && not !preserve_whitespace then 683 escape_text (collapse_whitespace text) 684 else 685 escape_text text 686 in 687 Buffer.add_string buf processed_text 688 689 | Some (CommentNode text) -> 690 prev_was_section_end := false; 691 Buffer.add_string buf "<!--"; 692 Buffer.add_string buf text; 693 Buffer.add_string buf "-->" 694 695 | Some (DoctypeNode node) -> 696 prev_was_section_end := false; 697 Buffer.add_string buf (serialize_node test.options ~in_raw_text:false node) 698 done; 699 700 let serialized = Buffer.contents buf in 701 702 (* Check if it matches any expected output *) 703 let matches = List.exists (fun exp -> serialized = exp) test.expected in 704 705 (matches, serialized, test.expected) 706 with e -> 707 (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected) 708 709(* Run all tests in a file *) 710let format_options opts = 711 let parts = [] in 712 let parts = if opts.quote_char_explicit then 713 Printf.sprintf "quote_char='%c'" opts.quote_char :: parts else parts in 714 let parts = if not opts.minimize_boolean_attributes then 715 "minimize_bool=false" :: parts else parts in 716 let parts = if opts.use_trailing_solidus then 717 "trailing_solidus=true" :: parts else parts in 718 let parts = if opts.escape_lt_in_attrs then 719 "escape_lt=true" :: parts else parts in 720 let parts = if opts.escape_rcdata then 721 "escape_rcdata=true" :: parts else parts in 722 let parts = if opts.strip_whitespace then 723 "strip_ws=true" :: parts else parts in 724 let parts = if opts.inject_meta_charset then 725 "inject_charset=true" :: parts else parts in 726 let parts = if not opts.omit_optional_tags then 727 "omit_tags=false" :: parts else parts in 728 if parts = [] then "(defaults)" else String.concat ", " (List.rev parts) 729 730let run_file path = 731 let content = 732 let ic = open_in path in 733 let n = in_channel_length ic in 734 let s = really_input_string ic n in 735 close_in ic; 736 s 737 in 738 739 let json = match Jsont_bytesrw.decode_string Jsont.json content with 740 | Ok j -> j 741 | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e) 742 in 743 744 let obj = json_object json in 745 let tests_json = match json_mem "tests" obj with 746 | Some t -> json_array t 747 | None -> [] 748 in 749 750 let filename = Filename.basename path in 751 let passed = ref 0 in 752 let failed = ref 0 in 753 let results = ref [] in 754 755 List.iteri (fun i test_json -> 756 try 757 let test = parse_test_case test_json in 758 let (success, actual, expected) = run_test test in 759 760 let result : Report.test_result = { 761 test_num = i + 1; 762 description = test.description; 763 input = String.concat "\n" (List.map (fun tok -> 764 (* Simplified token representation *) 765 match tok with 766 | Jsont.Array (arr, _) -> 767 (match arr with 768 | Jsont.String (ty, _) :: rest -> 769 Printf.sprintf "%s: %s" ty (String.concat ", " (List.map (function 770 | Jsont.String (s, _) -> Printf.sprintf "%S" s 771 | Jsont.Object _ -> "{...}" 772 | Jsont.Null _ -> "null" 773 | _ -> "?" 774 ) rest)) 775 | _ -> "?") 776 | _ -> "?" 777 ) test.input); 778 expected = String.concat " | " expected; 779 actual; 780 success; 781 details = [ 782 ("Options", format_options test.options); 783 ("Expected Variants", string_of_int (List.length expected)); 784 ]; 785 raw_test_data = Some test.raw_json; 786 } in 787 results := result :: !results; 788 789 if success then incr passed else incr failed 790 with e -> 791 incr failed; 792 let result : Report.test_result = { 793 test_num = i + 1; 794 description = Printf.sprintf "Test %d" (i + 1); 795 input = ""; 796 expected = ""; 797 actual = Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e); 798 success = false; 799 details = []; 800 raw_test_data = Some (json_to_string test_json); 801 } in 802 results := result :: !results; 803 Printf.eprintf "Exception parsing test %d: %s\n" (i + 1) (Printexc.to_string e) 804 ) tests_json; 805 806 let file_result : Report.file_result = { 807 filename; 808 test_type = "Serializer"; 809 passed_count = !passed; 810 failed_count = !failed; 811 tests = List.rev !results; 812 } in 813 (file_result, !passed, !failed) 814 815let () = 816 let test_dir = Sys.argv.(1) in 817 let files = Sys.readdir test_dir |> Array.to_list in 818 let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in 819 820 let total_passed = ref 0 in 821 let total_failed = ref 0 in 822 let file_results = ref [] in 823 824 List.iter (fun file -> 825 let path = Filename.concat test_dir file in 826 let (file_result, passed, failed) = run_file path in 827 total_passed := !total_passed + passed; 828 total_failed := !total_failed + failed; 829 file_results := file_result :: !file_results; 830 Printf.printf "%s: %d passed, %d failed\n" file passed failed 831 ) (List.sort String.compare test_files); 832 833 Printf.printf "\n=== Summary ===\n"; 834 Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed; 835 836 (* Generate HTML report *) 837 let report : Report.report = { 838 title = "HTML5 Serializer Tests"; 839 test_type = "serializer"; 840 description = "These tests validate the HTML serialization algorithm for converting DOM trees back to HTML text. \ 841 Each test provides a sequence of tokens (start tags, end tags, text, comments, doctypes) and one \ 842 or more valid serialized outputs. Tests cover attribute quoting, boolean attribute minimization, \ 843 self-closing tag syntax (trailing solidus), entity escaping, whitespace handling, meta charset \ 844 injection, and optional tag omission rules as specified in the HTML Standard. Multiple expected \ 845 outputs allow for valid variations in serialization style."; 846 files = List.rev !file_results; 847 total_passed = !total_passed; 848 total_failed = !total_failed; 849 } in 850 Report.generate_report report "test_serializer_report.html"; 851 852 exit (if !total_failed > 0 then 1 else 0)