OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* HTML5 Tree Builder *) 2 3module Dom = Dom 4module Token = Tokenizer_token 5module State = Tokenizer_state 6 7type fragment_context = { 8 tag_name : string; 9 namespace : string option; 10} 11 12type formatting_entry = 13 | Marker 14 | Entry of { 15 name : string; 16 attrs : (string * string) list; 17 node : Dom.node; 18 } 19 20type parse_error = { 21 code : Parse_error_code.t; 22 line : int; 23 column : int; 24} 25 26type t = { 27 mutable document : Dom.node; 28 mutable mode : Parser_insertion_mode.t; 29 mutable original_mode : Parser_insertion_mode.t option; 30 mutable open_elements : Dom.node list; 31 mutable active_formatting : formatting_entry list; 32 mutable head_element : Dom.node option; 33 mutable form_element : Dom.node option; 34 mutable frameset_ok : bool; 35 mutable ignore_lf : bool; 36 mutable foster_parenting : bool; 37 mutable pending_table_chars : string list; 38 mutable template_modes : Parser_insertion_mode.t list; 39 mutable quirks_mode : Dom.quirks_mode; 40 mutable errors : parse_error list; 41 collect_errors : bool; 42 fragment_context : fragment_context option; 43 mutable fragment_context_element : Dom.node option; 44 iframe_srcdoc : bool; 45 mutable current_line : int; 46 mutable current_column : int; 47} 48 49let create ?(collect_errors=false) ?fragment_context ?(iframe_srcdoc=false) () = 50 let is_fragment = fragment_context <> None in 51 let doc = if is_fragment then Dom.create_document_fragment () else Dom.create_document () in 52 let t = { 53 document = doc; 54 mode = Parser_insertion_mode.Initial; 55 original_mode = None; 56 open_elements = []; 57 active_formatting = []; 58 head_element = None; 59 form_element = None; 60 frameset_ok = true; 61 ignore_lf = false; 62 foster_parenting = false; 63 pending_table_chars = []; 64 template_modes = []; 65 quirks_mode = Dom.No_quirks; 66 errors = []; 67 collect_errors; 68 fragment_context; 69 fragment_context_element = None; 70 iframe_srcdoc; 71 current_line = 1; 72 current_column = 1; 73 } in 74 (* Initialize fragment parsing *) 75 (match fragment_context with 76 | Some ctx -> 77 let name = String.lowercase_ascii ctx.tag_name in 78 let ns = ctx.namespace in 79 (* Create html root *) 80 let root = Dom.create_element "html" () in 81 Dom.append_child doc root; 82 t.open_elements <- [root]; 83 (* For foreign content contexts, create context element *) 84 (match ns with 85 | Some namespace when namespace <> "html" -> 86 let context_elem = Dom.create_element ctx.tag_name ~namespace:ns () in 87 Dom.append_child root context_elem; 88 t.open_elements <- [context_elem; root]; 89 t.fragment_context_element <- Some context_elem 90 | _ -> ()); 91 (* Set initial mode based on context *) 92 t.mode <- ( 93 if name = "html" then Parser_insertion_mode.Before_head 94 else if Parser_constants.is_table_section_element name && (ns = None || ns = Some "html") then 95 Parser_insertion_mode.In_table_body 96 else if name = "tr" && (ns = None || ns = Some "html") then 97 Parser_insertion_mode.In_row 98 else if Parser_constants.is_table_cell_element name && (ns = None || ns = Some "html") then 99 Parser_insertion_mode.In_cell 100 else if name = "caption" && (ns = None || ns = Some "html") then 101 Parser_insertion_mode.In_caption 102 else if name = "colgroup" && (ns = None || ns = Some "html") then 103 Parser_insertion_mode.In_column_group 104 else if name = "table" && (ns = None || ns = Some "html") then 105 Parser_insertion_mode.In_table 106 else if name = "template" && (ns = None || ns = Some "html") then begin 107 t.template_modes <- [Parser_insertion_mode.In_template]; 108 Parser_insertion_mode.In_template 109 end 110 else 111 Parser_insertion_mode.In_body 112 ); 113 t.frameset_ok <- false 114 | None -> ()); 115 t 116 117(* Position tracking for error reporting *) 118let set_position t ~line ~column = 119 t.current_line <- line; 120 t.current_column <- column 121 122(* Error handling *) 123let parse_error t code = 124 if t.collect_errors then 125 t.errors <- { code = Parse_error_code.of_string code; line = t.current_line; column = t.current_column } :: t.errors 126 127(* Stack helpers *) 128let current_node t = 129 match t.open_elements with 130 | [] -> None 131 | x :: _ -> Some x 132 133let adjusted_current_node t = 134 match t.fragment_context, t.open_elements with 135 | Some ctx, [_] -> 136 (* Fragment case: use context element info *) 137 Some (Dom.create_element ctx.tag_name ~namespace:ctx.namespace ()) 138 | _, x :: _ -> Some x 139 | _, [] -> None 140 141let is_in_html_namespace node = 142 node.Dom.namespace = None || node.Dom.namespace = Some "html" 143 144(* Namespace-aware check for "special" elements per WHATWG spec *) 145let is_special_element node = 146 let name = String.lowercase_ascii node.Dom.name in 147 match node.Dom.namespace with 148 | None | Some "html" -> Parser_constants.is_special name 149 | Some "mathml" -> List.mem name ["mi"; "mo"; "mn"; "ms"; "mtext"; "annotation-xml"] 150 | Some "svg" -> List.mem name ["foreignobject"; "desc"; "title"] 151 | _ -> false 152 153let adjusted_current_node_in_html_namespace t = 154 match adjusted_current_node t with 155 | Some node -> is_in_html_namespace node 156 | None -> true 157 158(* Insertion helpers *) 159let appropriate_insertion_place t = 160 match current_node t with 161 | None -> (t.document, None) 162 | Some target -> 163 if t.foster_parenting && Parser_constants.is_foster_parenting_element target.Dom.name then begin 164 (* Foster parenting per WHATWG spec *) 165 (* Step 1: Find last (most recent) template and table in stack *) 166 (* Note: index 0 = top of stack = most recently added *) 167 let last_template_idx = ref None in 168 let last_table_idx = ref None in 169 List.iteri (fun i n -> 170 (* Take first match (most recent = lowest index) *) 171 if n.Dom.name = "template" && !last_template_idx = None then last_template_idx := Some i; 172 if n.Dom.name = "table" && !last_table_idx = None then last_table_idx := Some i 173 ) t.open_elements; 174 175 (* Step 2-3: If last template is more recent than last table (lower index = more recent) *) 176 match !last_template_idx, !last_table_idx with 177 | Some ti, None -> 178 (* No table, use template content *) 179 let template = List.nth t.open_elements ti in 180 (match template.Dom.template_content with 181 | Some tc -> (tc, None) 182 | None -> (template, None)) 183 | Some ti, Some tbi when ti < tbi -> 184 (* Template is more recent than table, use template content *) 185 let template = List.nth t.open_elements ti in 186 (match template.Dom.template_content with 187 | Some tc -> (tc, None) 188 | None -> (template, None)) 189 | _, Some tbi -> 190 (* Use table's parent as foster parent *) 191 let table = List.nth t.open_elements tbi in 192 (match table.Dom.parent with 193 | Some parent -> (parent, Some table) 194 | None -> 195 (* Step 6: element above table in stack (index + 1 since 0 is top) *) 196 if tbi + 1 < List.length t.open_elements then 197 (List.nth t.open_elements (tbi + 1), None) 198 else 199 (t.document, None)) 200 | None, None -> 201 (* No table or template, use document *) 202 (t.document, None) 203 end else begin 204 (* If target is a template, insert into its content document fragment *) 205 match target.Dom.template_content with 206 | Some tc -> (tc, None) 207 | None -> (target, None) 208 end 209 210let insert_element t name ?(namespace=None) ?(push=false) attrs = 211 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 212 let node = Dom.create_element name ~namespace ~attrs ~location () in 213 let (parent, before) = appropriate_insertion_place t in 214 (match before with 215 | None -> Dom.append_child parent node 216 | Some ref -> Dom.insert_before parent node ref); 217 if push then t.open_elements <- node :: t.open_elements; 218 node 219 220let insert_element_for_token t (tag : Token.tag) = 221 insert_element t tag.name ~push:true tag.attrs 222 223let insert_foreign_element t (tag : Token.tag) namespace = 224 let attrs = 225 if namespace = Some "svg" then 226 Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs tag.attrs) 227 else 228 Parser_constants.adjust_foreign_attrs tag.attrs 229 in 230 let name = 231 if namespace = Some "svg" then Parser_constants.adjust_svg_tag_name tag.name 232 else tag.name 233 in 234 let node = insert_element t name ~namespace attrs in 235 t.open_elements <- node :: t.open_elements; 236 node 237 238let insert_character t data = 239 if t.ignore_lf && String.length data > 0 && data.[0] = '\n' then begin 240 t.ignore_lf <- false; 241 if String.length data > 1 then begin 242 let rest = String.sub data 1 (String.length data - 1) in 243 let (parent, before) = appropriate_insertion_place t in 244 Dom.insert_text_at parent rest before 245 end 246 end else begin 247 t.ignore_lf <- false; 248 let (parent, before) = appropriate_insertion_place t in 249 Dom.insert_text_at parent data before 250 end 251 252let insert_comment t data = 253 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 254 let node = Dom.create_comment ~location data in 255 let (parent, _) = appropriate_insertion_place t in 256 Dom.append_child parent node 257 258let insert_comment_to_document t data = 259 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 260 let node = Dom.create_comment ~location data in 261 Dom.append_child t.document node 262 263(* Stack manipulation *) 264let pop_current t = 265 match t.open_elements with 266 | [] -> () 267 | _ :: rest -> t.open_elements <- rest 268 269let pop_until t pred = 270 let rec loop () = 271 match t.open_elements with 272 | [] -> () 273 | x :: rest -> 274 t.open_elements <- rest; 275 if not (pred x) then loop () 276 in 277 loop () 278 279let pop_until_tag t name = 280 pop_until t (fun n -> n.Dom.name = name) 281 282(* Pop until HTML namespace element with given name *) 283let pop_until_html_tag t name = 284 pop_until t (fun n -> n.Dom.name = name && is_in_html_namespace n) 285 286let pop_until_one_of t names = 287 pop_until t (fun n -> List.mem n.Dom.name names) 288 289(* Pop until HTML namespace element with one of given names *) 290let pop_until_html_one_of t names = 291 pop_until t (fun n -> List.mem n.Dom.name names && is_in_html_namespace n) 292 293(* Check if element is an HTML integration point *) 294let is_html_integration_point node = 295 (* SVG foreignObject, desc, and title are always HTML integration points *) 296 if node.Dom.namespace = Some "svg" && 297 Parser_constants.is_svg_html_integration node.Dom.name then true 298 (* annotation-xml is an HTML integration point only with specific encoding values *) 299 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then 300 match List.assoc_opt "encoding" node.Dom.attrs with 301 | Some enc -> 302 let enc_lower = String.lowercase_ascii enc in 303 enc_lower = "text/html" || enc_lower = "application/xhtml+xml" 304 | None -> false 305 else false 306 307(* Check if element is a MathML text integration point *) 308let is_mathml_text_integration_point node = 309 node.Dom.namespace = Some "mathml" && 310 Parser_constants.is_mathml_text_integration node.Dom.name 311 312(* Scope checks - integration points also terminate scope (except for table scope) *) 313(* Per WHATWG spec, scope checks only consider HTML namespace elements for the target names *) 314let has_element_in_scope_impl t names exclude_list ~check_integration_points = 315 let rec check = function 316 | [] -> false 317 | n :: rest -> 318 (* Target elements must be in HTML namespace *) 319 if is_in_html_namespace n && List.mem n.Dom.name names then true 320 else if is_in_html_namespace n && List.mem n.Dom.name exclude_list then false 321 (* Integration points terminate scope (unless we're checking table scope) *) 322 else if check_integration_points && (is_html_integration_point n || is_mathml_text_integration_point n) then false 323 else check rest 324 in 325 check t.open_elements 326 327let has_element_in_scope t name = 328 has_element_in_scope_impl t [name] Parser_constants.default_scope ~check_integration_points:true 329 330let has_element_in_button_scope t name = 331 has_element_in_scope_impl t [name] Parser_constants.button_scope ~check_integration_points:true 332 333let has_element_in_list_item_scope t name = 334 has_element_in_scope_impl t [name] Parser_constants.list_item_scope ~check_integration_points:true 335 336let has_element_in_table_scope t name = 337 has_element_in_scope_impl t [name] Parser_constants.table_scope ~check_integration_points:false 338 339let has_element_in_select_scope t name = 340 let rec check = function 341 | [] -> false 342 | n :: rest -> 343 if n.Dom.name = name then true 344 else if not (Parser_constants.is_select_scope_exclude n.Dom.name) then false 345 else check rest 346 in 347 check t.open_elements 348 349(* Implied end tags *) 350let generate_implied_end_tags t ?except () = 351 let rec loop () = 352 match current_node t with 353 | Some n when Parser_constants.is_implied_end_tag n.Dom.name -> 354 (match except with 355 | Some ex when n.Dom.name = ex -> () 356 | _ -> pop_current t; loop ()) 357 | _ -> () 358 in 359 loop () 360 361let generate_all_implied_end_tags t = 362 let rec loop () = 363 match current_node t with 364 | Some n when Parser_constants.is_thoroughly_implied_end_tag n.Dom.name -> 365 pop_current t; loop () 366 | _ -> () 367 in 368 loop () 369 370(* Active formatting elements *) 371let push_formatting_marker t = 372 t.active_formatting <- Marker :: t.active_formatting 373 374let push_formatting_element t node name attrs = 375 (* Noah's Ark: remove earlier identical elements (up to 3) *) 376 let rec count_and_remove same acc = function 377 | [] -> List.rev acc 378 | Marker :: rest -> List.rev acc @ (Marker :: rest) 379 | Entry e :: rest when e.name = name && e.attrs = attrs -> 380 if same >= 2 then 381 count_and_remove same acc rest (* Remove this one *) 382 else 383 count_and_remove (same + 1) (Entry e :: acc) rest 384 | x :: rest -> count_and_remove same (x :: acc) rest 385 in 386 t.active_formatting <- count_and_remove 0 [] t.active_formatting; 387 t.active_formatting <- Entry { name; attrs; node } :: t.active_formatting 388 389let clear_active_formatting_to_marker t = 390 let rec loop = function 391 | [] -> [] 392 | Marker :: rest -> rest 393 | _ :: rest -> loop rest 394 in 395 t.active_formatting <- loop t.active_formatting 396 397let reconstruct_active_formatting t = 398 let rec find_to_reconstruct acc = function 399 | [] -> acc 400 | Marker :: _ -> acc 401 | Entry e :: rest -> 402 if List.exists (fun n -> n == e.node) t.open_elements then acc 403 else find_to_reconstruct (Entry e :: acc) rest 404 in 405 let to_reconstruct = find_to_reconstruct [] t.active_formatting in 406 List.iter (fun entry -> 407 match entry with 408 | Entry e -> 409 let node = insert_element t e.name e.attrs in 410 t.open_elements <- node :: t.open_elements; 411 (* Update the entry to point to new node *) 412 t.active_formatting <- List.map (fun x -> 413 if x == entry then Entry { e with node } 414 else x 415 ) t.active_formatting 416 | Marker -> () 417 ) to_reconstruct 418 419(* Adoption agency algorithm - follows WHATWG spec *) 420let adoption_agency t tag_name = 421 (* Step 1: If current node is subject and not in active formatting list, just pop *) 422 (match current_node t with 423 | Some n when n.Dom.name = tag_name -> 424 let in_active = List.exists (function 425 | Entry e -> e.name = tag_name 426 | Marker -> false 427 ) t.active_formatting in 428 if not in_active then begin 429 pop_current t; 430 () (* Return early - this case is handled *) 431 end 432 | _ -> ()); 433 434 (* Step 2: Outer loop *) 435 let outer_loop_counter = ref 0 in 436 let done_flag = ref false in 437 438 while !outer_loop_counter < 8 && not !done_flag do 439 incr outer_loop_counter; 440 441 (* Step 3: Find formatting element in active formatting list *) 442 let rec find_formatting_index idx = function 443 | [] -> None 444 | Marker :: _ -> None 445 | Entry e :: rest -> 446 if e.name = tag_name then Some (idx, e.node, e.attrs) 447 else find_formatting_index (idx + 1) rest 448 in 449 let formatting_entry = find_formatting_index 0 t.active_formatting in 450 451 match formatting_entry with 452 | None -> 453 (* No formatting element found - done *) 454 done_flag := true 455 | Some (fmt_idx, fmt_node, fmt_attrs) -> 456 457 (* Step 4: Check if formatting element is in open elements *) 458 if not (List.exists (fun n -> n == fmt_node) t.open_elements) then begin 459 parse_error t "adoption-agency-1.2"; 460 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting; 461 done_flag := true 462 end 463 (* Step 5: Check if formatting element is in scope *) 464 else if not (has_element_in_scope t tag_name) then begin 465 parse_error t "adoption-agency-1.3"; 466 done_flag := true 467 end else begin 468 (* Step 6: Parse error if not current node *) 469 (match current_node t with 470 | Some n when n != fmt_node -> parse_error t "adoption-agency-1.3" 471 | _ -> ()); 472 473 (* Step 7: Find furthest block - first special element BELOW formatting element *) 474 (* open_elements is [current(top)...html(bottom)], formatting element is somewhere in the middle *) 475 (* We need the first special element going from formatting element toward current *) 476 (* This is the "topmost" (closest to formatting element) special element that is "lower" (closer to current) *) 477 let fmt_stack_idx = ref (-1) in 478 List.iteri (fun i n -> if n == fmt_node then fmt_stack_idx := i) t.open_elements; 479 let furthest_block = 480 if !fmt_stack_idx <= 0 then None 481 else begin 482 (* Look from fmt_stack_idx-1 down to 0, find first special element *) 483 let rec find_from_idx idx = 484 if idx < 0 then None 485 else 486 let n = List.nth t.open_elements idx in 487 if is_special_element n then Some n 488 else find_from_idx (idx - 1) 489 in 490 find_from_idx (!fmt_stack_idx - 1) 491 end 492 in 493 494 match furthest_block with 495 | None -> 496 (* Step 8: No furthest block - pop elements including formatting element *) 497 pop_until t (fun n -> n == fmt_node); 498 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting; 499 done_flag := true 500 501 | Some fb -> 502 (* Step 9: Let common ancestor be element immediately above formatting element *) 503 let rec find_common_ancestor = function 504 | [] -> None 505 | n :: rest when n == fmt_node -> 506 (match rest with x :: _ -> Some x | [] -> None) 507 | _ :: rest -> find_common_ancestor rest 508 in 509 let common_ancestor = find_common_ancestor t.open_elements in 510 511 (* Step 10: Bookmark starts after formatting element *) 512 let bookmark = ref (fmt_idx + 1) in 513 514 (* Step 11: Let last_node = furthest block *) 515 let last_node = ref fb in 516 517 (* Step 12: Inner loop *) 518 (* The inner loop processes elements between furthest_block and formatting_element, 519 removing non-formatting elements and reparenting formatting elements *) 520 let inner_loop_counter = ref 0 in 521 522 (* Get index of furthest block in open elements *) 523 let fb_idx = ref 0 in 524 List.iteri (fun i n -> if n == fb then fb_idx := i) t.open_elements; 525 526 (* Start from element after furthest block (toward formatting element) *) 527 let node_idx = ref (!fb_idx + 1) in 528 529 while !node_idx < List.length t.open_elements && 530 (List.nth t.open_elements !node_idx) != fmt_node do 531 incr inner_loop_counter; 532 let current_node = List.nth t.open_elements !node_idx in 533 534 (* Step 12.3: Find node in active formatting list *) 535 let rec find_node_in_formatting idx = function 536 | [] -> None 537 | Entry e :: _rest when e.node == current_node -> Some idx 538 | _ :: rest -> find_node_in_formatting (idx + 1) rest 539 in 540 let node_fmt_idx = find_node_in_formatting 0 t.active_formatting in 541 542 (* Step 12.4: If inner loop counter > 3 and node in active formatting, remove it *) 543 let node_fmt_idx = 544 match node_fmt_idx with 545 | Some idx when !inner_loop_counter > 3 -> 546 t.active_formatting <- List.filteri (fun i _ -> i <> idx) t.active_formatting; 547 if idx < !bookmark then decr bookmark; 548 None 549 | x -> x 550 in 551 552 (* Step 12.5: If node not in active formatting, remove from stack and continue *) 553 match node_fmt_idx with 554 | None -> 555 (* Remove from stack - this shifts indices *) 556 t.open_elements <- List.filteri (fun i _ -> i <> !node_idx) t.open_elements 557 (* Don't increment node_idx since we removed an element *) 558 559 | Some af_idx -> 560 (* Step 12.6: Create new element for node *) 561 let (node_name, node_attrs) = match List.nth t.active_formatting af_idx with 562 | Entry e -> (e.name, e.attrs) 563 | Marker -> failwith "unexpected marker" 564 in 565 let new_node_elem = Dom.create_element node_name ~attrs:node_attrs () in 566 567 (* Update active formatting with new node *) 568 t.active_formatting <- List.mapi (fun i entry -> 569 if i = af_idx then Entry { name = node_name; node = new_node_elem; attrs = node_attrs } 570 else entry 571 ) t.active_formatting; 572 573 (* Replace node in open elements *) 574 t.open_elements <- List.mapi (fun i n -> 575 if i = !node_idx then new_node_elem else n 576 ) t.open_elements; 577 578 (* Step 12.7: If last_node is furthest block, update bookmark *) 579 if !last_node == fb then 580 bookmark := af_idx + 1; 581 582 (* Step 12.8: Reparent last_node to new node *) 583 (match !last_node.Dom.parent with 584 | Some p -> Dom.remove_child p !last_node 585 | None -> ()); 586 Dom.append_child new_node_elem !last_node; 587 588 (* Step 12.9: Let last_node = new node *) 589 last_node := new_node_elem; 590 591 (* Move to next element *) 592 incr node_idx 593 done; 594 595 (* Step 13: Insert last_node into common ancestor *) 596 (match common_ancestor with 597 | Some ca -> 598 (match !last_node.Dom.parent with 599 | Some p -> Dom.remove_child p !last_node 600 | None -> ()); 601 (* Check if we need foster parenting *) 602 if t.foster_parenting && Parser_constants.is_foster_parenting_element ca.Dom.name then begin 603 (* Find table and insert before it *) 604 let rec find_table = function 605 | [] -> None 606 | n :: rest when n.Dom.name = "table" -> Some (n, rest) 607 | _ :: rest -> find_table rest 608 in 609 match find_table t.open_elements with 610 | Some (table, _) -> 611 (match table.Dom.parent with 612 | Some parent -> Dom.insert_before parent !last_node table 613 | None -> Dom.append_child ca !last_node) 614 | None -> Dom.append_child ca !last_node 615 end else begin 616 (* If common ancestor is template, insert into its content *) 617 match ca.Dom.template_content with 618 | Some tc -> Dom.append_child tc !last_node 619 | None -> Dom.append_child ca !last_node 620 end 621 | None -> ()); 622 623 (* Step 14: Create new formatting element *) 624 let new_formatting = Dom.create_element tag_name ~attrs:fmt_attrs () in 625 626 (* Step 15: Move children of furthest block to new formatting element *) 627 let fb_children = fb.Dom.children in 628 List.iter (fun child -> 629 Dom.remove_child fb child; 630 Dom.append_child new_formatting child 631 ) fb_children; 632 633 (* Step 16: Append new formatting element to furthest block *) 634 Dom.append_child fb new_formatting; 635 636 (* Step 17: Remove old from active formatting, insert new at bookmark *) 637 let new_entry = Entry { name = tag_name; node = new_formatting; attrs = fmt_attrs } in 638 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting; 639 (* Adjust bookmark since we removed an element *) 640 let adjusted_bookmark = if fmt_idx < !bookmark then !bookmark - 1 else !bookmark in 641 let rec insert_at_bookmark idx acc = function 642 | [] -> List.rev (new_entry :: acc) 643 | x :: rest when idx = adjusted_bookmark -> 644 List.rev_append acc (new_entry :: x :: rest) 645 | x :: rest -> insert_at_bookmark (idx + 1) (x :: acc) rest 646 in 647 t.active_formatting <- insert_at_bookmark 0 [] t.active_formatting; 648 649 (* Step 18: Remove formatting element from open elements, insert new after furthest block *) 650 (* "After" in stack terms means new_formatting should be between fb and current node *) 651 (* In our list orientation (current at index 0), this means new_formatting at lower index than fb *) 652 t.open_elements <- List.filter (fun n -> n != fmt_node) t.open_elements; 653 (* Find fb and insert new_formatting before it (lower index = closer to current) *) 654 let rec insert_before acc = function 655 | [] -> List.rev (new_formatting :: acc) 656 | n :: rest when n == fb -> 657 (* Insert new_formatting before fb: acc reversed, then new_formatting, then fb, then rest *) 658 List.rev_append acc (new_formatting :: n :: rest) 659 | n :: rest -> insert_before (n :: acc) rest 660 in 661 t.open_elements <- insert_before [] t.open_elements 662 (* Continue outer loop *) 663 end 664 done 665 666(* Close p element *) 667let close_p_element t = 668 generate_implied_end_tags t ~except:"p" (); 669 (match current_node t with 670 | Some n when n.Dom.name <> "p" -> parse_error t "end-tag-p-implied-but-open-elements" 671 | _ -> ()); 672 pop_until_tag t "p" 673 674(* Reset insertion mode *) 675let reset_insertion_mode t = 676 let rec check_node last = function 677 | [] -> t.mode <- Parser_insertion_mode.In_body 678 | node :: rest -> 679 let is_last = rest = [] in 680 let node_to_check = 681 if is_last then 682 match t.fragment_context with 683 | Some ctx -> Dom.create_element ctx.tag_name ~namespace:ctx.namespace () 684 | None -> node 685 else node 686 in 687 let name = node_to_check.Dom.name in 688 if name = "select" then begin 689 if not is_last then begin 690 let rec find_table_or_template = function 691 | [] -> () 692 | n :: rest -> 693 if n.Dom.name = "template" then t.mode <- Parser_insertion_mode.In_select 694 else if n.Dom.name = "table" then t.mode <- Parser_insertion_mode.In_select_in_table 695 else find_table_or_template rest 696 in 697 find_table_or_template rest 698 end; 699 if t.mode <> Parser_insertion_mode.In_select_in_table then 700 t.mode <- Parser_insertion_mode.In_select 701 end else if Parser_constants.is_table_cell_element name && not is_last then 702 t.mode <- Parser_insertion_mode.In_cell 703 else if name = "tr" then 704 t.mode <- Parser_insertion_mode.In_row 705 else if Parser_constants.is_table_section_element name then 706 t.mode <- Parser_insertion_mode.In_table_body 707 else if name = "caption" then 708 t.mode <- Parser_insertion_mode.In_caption 709 else if name = "colgroup" then 710 t.mode <- Parser_insertion_mode.In_column_group 711 else if name = "table" then 712 t.mode <- Parser_insertion_mode.In_table 713 else if name = "template" then 714 t.mode <- (match t.template_modes with m :: _ -> m | [] -> Parser_insertion_mode.In_template) 715 else if name = "head" && not is_last then 716 t.mode <- Parser_insertion_mode.In_head 717 else if name = "body" then 718 t.mode <- Parser_insertion_mode.In_body 719 else if name = "frameset" then 720 t.mode <- Parser_insertion_mode.In_frameset 721 else if name = "html" then 722 t.mode <- (if t.head_element = None then Parser_insertion_mode.Before_head else Parser_insertion_mode.After_head) 723 else if is_last then 724 t.mode <- Parser_insertion_mode.In_body 725 else 726 check_node last rest 727 in 728 check_node false t.open_elements 729 730let is_whitespace s = 731 let ws = [' '; '\t'; '\n'; '\x0C'; '\r'] in 732 String.for_all (fun c -> List.mem c ws) s 733 734(* Mode handlers *) 735let rec process_initial t token = 736 match token with 737 | Token.Character data when is_whitespace data -> () 738 | Token.Comment data -> insert_comment_to_document t data 739 | Token.Doctype dt -> 740 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 741 let node = Dom.create_doctype ?name:dt.name ?public_id:dt.public_id ?system_id:dt.system_id ~location () in 742 Dom.append_child t.document node; 743 (* Quirks mode detection *) 744 if dt.force_quirks then 745 t.quirks_mode <- Dom.Quirks 746 else if dt.name <> Some "html" then 747 t.quirks_mode <- Dom.Quirks 748 else begin 749 let pub = Option.map String.lowercase_ascii dt.public_id in 750 let sys = Option.map String.lowercase_ascii dt.system_id in 751 let is_quirky = 752 (match pub with 753 | Some p -> List.mem p Parser_constants.quirky_public_matches || 754 List.exists (fun prefix -> String.length p >= String.length prefix && 755 String.sub p 0 (String.length prefix) = prefix) Parser_constants.quirky_public_prefixes 756 | None -> false) || 757 (match sys with 758 | Some s -> List.mem s Parser_constants.quirky_system_matches 759 | None -> false) 760 in 761 if is_quirky then t.quirks_mode <- Dom.Quirks 762 else begin 763 let is_limited_quirky = 764 match pub with 765 | Some p -> List.exists (fun prefix -> String.length p >= String.length prefix && 766 String.sub p 0 (String.length prefix) = prefix) 767 Parser_constants.limited_quirky_public_prefixes 768 | None -> false 769 in 770 if is_limited_quirky then t.quirks_mode <- Dom.Limited_quirks 771 end 772 end; 773 t.mode <- Parser_insertion_mode.Before_html 774 | _ -> 775 parse_error t "expected-doctype-but-got-other"; 776 t.quirks_mode <- Dom.Quirks; 777 t.mode <- Parser_insertion_mode.Before_html; 778 process_token t token 779 780and process_before_html t token = 781 match token with 782 | Token.Doctype _ -> parse_error t "unexpected-doctype" 783 | Token.Comment data -> insert_comment_to_document t data 784 | Token.Character data when is_whitespace data -> () 785 | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } -> 786 let html = insert_element t "html" attrs in 787 t.open_elements <- [html]; 788 t.mode <- Parser_insertion_mode.Before_head 789 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] -> 790 let html = insert_element t "html" [] in 791 t.open_elements <- [html]; 792 t.mode <- Parser_insertion_mode.Before_head; 793 process_token t token 794 | Token.Tag { kind = Token.End; name; _ } -> 795 parse_error t ("unexpected-end-tag:" ^ name) 796 | _ -> 797 let html = insert_element t "html" [] in 798 t.open_elements <- [html]; 799 t.mode <- Parser_insertion_mode.Before_head; 800 process_token t token 801 802and process_before_head t token = 803 match token with 804 | Token.Character data when is_whitespace data -> () 805 | Token.Comment data -> insert_comment t data 806 | Token.Doctype _ -> parse_error t "unexpected-doctype" 807 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 808 process_in_body t token 809 | Token.Tag { kind = Token.Start; name = "head"; attrs; _ } -> 810 let head = insert_element t "head" attrs in 811 t.open_elements <- head :: t.open_elements; 812 t.head_element <- Some head; 813 t.mode <- Parser_insertion_mode.In_head 814 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] -> 815 let head = insert_element t "head" [] in 816 t.open_elements <- head :: t.open_elements; 817 t.head_element <- Some head; 818 t.mode <- Parser_insertion_mode.In_head; 819 process_token t token 820 | Token.Tag { kind = Token.End; name; _ } -> 821 parse_error t ("unexpected-end-tag:" ^ name) 822 | _ -> 823 let head = insert_element t "head" [] in 824 t.open_elements <- head :: t.open_elements; 825 t.head_element <- Some head; 826 t.mode <- Parser_insertion_mode.In_head; 827 process_token t token 828 829and process_in_head t token = 830 match token with 831 | Token.Character data when is_whitespace data -> 832 insert_character t data 833 | Token.Character data -> 834 (* Extract leading whitespace *) 835 let rec count_leading_ws i = 836 if i >= String.length data then i 837 else match data.[i] with 838 | '\t' | '\n' | '\x0C' | '\r' | ' ' -> count_leading_ws (i + 1) 839 | _ -> i 840 in 841 let ws_count = count_leading_ws 0 in 842 let leading_ws = String.sub data 0 ws_count in 843 let remaining = String.sub data ws_count (String.length data - ws_count) in 844 (* If there's leading whitespace and current element has children, insert it *) 845 if leading_ws <> "" then 846 (match current_node t with 847 | Some n when n.Dom.children <> [] -> insert_character t leading_ws 848 | _ -> ()); 849 pop_current t; 850 t.mode <- Parser_insertion_mode.After_head; 851 process_token t (Token.Character remaining) 852 | Token.Comment data -> 853 insert_comment t data 854 | Token.Doctype _ -> 855 parse_error t "unexpected-doctype" 856 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 857 process_in_body t token 858 | Token.Tag { kind = Token.Start; name; attrs; _ } 859 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"] -> 860 ignore (insert_element t name attrs) 861 | Token.Tag { kind = Token.Start; name = "title"; attrs; self_closing } -> 862 ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs; self_closing }); 863 t.original_mode <- Some t.mode; 864 t.mode <- Parser_insertion_mode.Text 865 | Token.Tag { kind = Token.Start; name; attrs; self_closing } 866 when List.mem name ["noframes"; "style"] -> 867 ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing }); 868 t.original_mode <- Some t.mode; 869 t.mode <- Parser_insertion_mode.Text 870 | Token.Tag { kind = Token.Start; name = "noscript"; attrs; self_closing } -> 871 (* Scripting is disabled: parse noscript content as HTML *) 872 ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs; self_closing }); 873 t.mode <- Parser_insertion_mode.In_head_noscript 874 | Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } -> 875 ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing }); 876 t.original_mode <- Some t.mode; 877 t.mode <- Parser_insertion_mode.Text 878 | Token.Tag { kind = Token.End; name = "head"; _ } -> 879 pop_current t; 880 t.mode <- Parser_insertion_mode.After_head 881 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] -> 882 pop_current t; 883 t.mode <- Parser_insertion_mode.After_head; 884 process_token t token 885 | Token.Tag { kind = Token.Start; name = "template"; attrs; _ } -> 886 let node = Dom.create_template ~attrs () in 887 let (parent, _) = appropriate_insertion_place t in 888 Dom.append_child parent node; 889 t.open_elements <- node :: t.open_elements; 890 push_formatting_marker t; 891 t.frameset_ok <- false; 892 t.mode <- Parser_insertion_mode.In_template; 893 t.template_modes <- Parser_insertion_mode.In_template :: t.template_modes 894 | Token.Tag { kind = Token.End; name = "template"; _ } -> 895 if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then 896 parse_error t "unexpected-end-tag" 897 else begin 898 generate_all_implied_end_tags t; 899 (match current_node t with 900 | Some n when not (n.Dom.name = "template" && is_in_html_namespace n) -> parse_error t "unexpected-end-tag" 901 | _ -> ()); 902 pop_until_html_tag t "template"; 903 clear_active_formatting_to_marker t; 904 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 905 reset_insertion_mode t 906 end 907 | Token.Tag { kind = Token.Start; name = "head"; _ } -> 908 parse_error t "unexpected-start-tag" 909 | Token.Tag { kind = Token.End; name; _ } -> 910 parse_error t ("unexpected-end-tag:" ^ name) 911 | _ -> 912 pop_current t; 913 t.mode <- Parser_insertion_mode.After_head; 914 process_token t token 915 916and process_in_head_noscript t token = 917 match token with 918 | Token.Character data when is_whitespace data -> 919 process_in_head t token 920 | Token.Character _ -> 921 parse_error t "unexpected-char-in-noscript"; 922 pop_current t; (* Pop noscript *) 923 t.mode <- Parser_insertion_mode.In_head; 924 process_token t token 925 | Token.Comment _ -> 926 process_in_head t token 927 | Token.Doctype _ -> 928 parse_error t "unexpected-doctype" 929 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 930 process_in_body t token 931 | Token.Tag { kind = Token.Start; name; _ } 932 when List.mem name ["basefont"; "bgsound"; "link"; "meta"; "noframes"; "style"] -> 933 process_in_head t token 934 | Token.Tag { kind = Token.Start; name; _ } 935 when List.mem name ["head"; "noscript"] -> 936 parse_error t "unexpected-start-tag" 937 | Token.Tag { kind = Token.Start; name; _ } -> 938 parse_error t ("bad-start-tag-in-head-noscript:" ^ name); 939 pop_current t; (* Pop noscript *) 940 t.mode <- Parser_insertion_mode.In_head; 941 process_token t token 942 | Token.Tag { kind = Token.End; name = "noscript"; _ } -> 943 pop_current t; (* Pop noscript *) 944 t.mode <- Parser_insertion_mode.In_head 945 | Token.Tag { kind = Token.End; name = "br"; _ } -> 946 parse_error t "unexpected-end-tag"; 947 pop_current t; (* Pop noscript *) 948 t.mode <- Parser_insertion_mode.In_head; 949 process_token t token 950 | Token.Tag { kind = Token.End; name; _ } -> 951 parse_error t ("unexpected-end-tag:" ^ name) 952 | Token.EOF -> 953 parse_error t "expected-closing-tag-but-got-eof"; 954 pop_current t; (* Pop noscript *) 955 t.mode <- Parser_insertion_mode.In_head; 956 process_token t token 957 958and process_after_head t token = 959 match token with 960 | Token.Character data when is_whitespace data -> 961 insert_character t data 962 | Token.Comment data -> 963 insert_comment t data 964 | Token.Doctype _ -> 965 parse_error t "unexpected-doctype" 966 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 967 process_in_body t token 968 | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } -> 969 ignore (insert_element t "body" ~push:true attrs); 970 t.frameset_ok <- false; 971 t.mode <- Parser_insertion_mode.In_body 972 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> 973 ignore (insert_element t "frameset" ~push:true attrs); 974 t.mode <- Parser_insertion_mode.In_frameset 975 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } -> 976 (* Special handling for input type="hidden" - parse error, ignore *) 977 let input_type = List.assoc_opt "type" attrs in 978 (match input_type with 979 | Some typ when String.lowercase_ascii typ = "hidden" -> 980 parse_error t "unexpected-hidden-input-after-head" 981 | _ -> 982 (* Non-hidden input creates body *) 983 let body = insert_element t "body" [] in 984 t.open_elements <- body :: t.open_elements; 985 t.mode <- Parser_insertion_mode.In_body; 986 process_token t token) 987 | Token.Tag { kind = Token.Start; name; _ } 988 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] -> 989 parse_error t "unexpected-start-tag"; 990 (match t.head_element with 991 | Some head -> 992 t.open_elements <- head :: t.open_elements; 993 process_in_head t token; 994 t.open_elements <- List.filter (fun n -> n != head) t.open_elements 995 | None -> ()) 996 | Token.Tag { kind = Token.End; name = "template"; _ } -> 997 process_in_head t token 998 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] -> 999 let body = insert_element t "body" [] in 1000 t.open_elements <- body :: t.open_elements; 1001 t.mode <- Parser_insertion_mode.In_body; 1002 process_token t token 1003 | Token.Tag { kind = Token.Start; name = "head"; _ } -> 1004 parse_error t "unexpected-start-tag" 1005 | Token.Tag { kind = Token.End; name; _ } -> 1006 parse_error t ("unexpected-end-tag:" ^ name) 1007 | _ -> 1008 let body = insert_element t "body" [] in 1009 t.open_elements <- body :: t.open_elements; 1010 t.mode <- Parser_insertion_mode.In_body; 1011 process_token t token 1012 1013and process_in_body t token = 1014 match token with 1015 | Token.Character "\x00" -> 1016 parse_error t "unexpected-null-character" 1017 | Token.Character data -> 1018 reconstruct_active_formatting t; 1019 insert_character t data; 1020 if not (is_whitespace data) then t.frameset_ok <- false 1021 | Token.Comment data -> 1022 insert_comment t data 1023 | Token.Doctype _ -> 1024 parse_error t "unexpected-doctype" 1025 | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } -> 1026 parse_error t "unexpected-start-tag"; 1027 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then 1028 (* Find the html element (at the bottom of the stack) *) 1029 let html_elem = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in 1030 (match html_elem with 1031 | Some html -> 1032 List.iter (fun (k, v) -> 1033 if not (Dom.has_attr html k) then Dom.set_attr html k v 1034 ) attrs 1035 | None -> ()) 1036 | Token.Tag { kind = Token.Start; name; _ } 1037 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] -> 1038 process_in_head t token 1039 | Token.Tag { kind = Token.End; name = "template"; _ } -> 1040 process_in_head t token 1041 | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } -> 1042 parse_error t "unexpected-start-tag"; 1043 (* Find body element on stack - it should be near the end (html is last) *) 1044 let body = List.find_opt (fun n -> n.Dom.name = "body") t.open_elements in 1045 (match body with 1046 | Some body when not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) -> 1047 t.frameset_ok <- false; 1048 List.iter (fun (k, v) -> 1049 if not (Dom.has_attr body k) then Dom.set_attr body k v 1050 ) attrs 1051 | _ -> ()) 1052 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> 1053 if not t.frameset_ok then 1054 parse_error t "unexpected-start-tag-ignored" 1055 else begin 1056 (* Find body element on the stack *) 1057 let rec find_body_index idx = function 1058 | [] -> None 1059 | n :: rest -> 1060 if n.Dom.name = "body" then Some (idx, n) 1061 else find_body_index (idx + 1) rest 1062 in 1063 match find_body_index 0 t.open_elements with 1064 | None -> 1065 parse_error t "unexpected-start-tag-ignored" 1066 | Some (idx, body_elem) -> 1067 (* Remove body from its parent (the html element) *) 1068 (match body_elem.Dom.parent with 1069 | Some parent -> Dom.remove_child parent body_elem 1070 | None -> ()); 1071 (* Pop all elements up to and including body - keep only elements after body_idx *) 1072 let rec drop n lst = if n <= 0 then lst else match lst with [] -> [] | _ :: rest -> drop (n - 1) rest in 1073 t.open_elements <- drop (idx + 1) t.open_elements; 1074 (* Insert frameset element *) 1075 ignore (insert_element t "frameset" ~push:true attrs); 1076 t.mode <- Parser_insertion_mode.In_frameset 1077 end 1078 | Token.EOF -> 1079 if t.template_modes <> [] then 1080 process_in_template t token 1081 else begin 1082 let has_unclosed = List.exists (fun n -> 1083 not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"]) 1084 ) t.open_elements in 1085 if has_unclosed then parse_error t "expected-closing-tag-but-got-eof" 1086 end 1087 | Token.Tag { kind = Token.End; name = "body"; _ } -> 1088 if not (has_element_in_scope t "body") then 1089 parse_error t "unexpected-end-tag" 1090 else begin 1091 let has_unclosed = List.exists (fun n -> 1092 not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"]) 1093 ) t.open_elements in 1094 if has_unclosed then parse_error t "end-tag-too-early"; 1095 t.mode <- Parser_insertion_mode.After_body 1096 end 1097 | Token.Tag { kind = Token.End; name = "html"; _ } -> 1098 if not (has_element_in_scope t "body") then 1099 parse_error t "unexpected-end-tag" 1100 else begin 1101 t.mode <- Parser_insertion_mode.After_body; 1102 process_token t token 1103 end 1104 | Token.Tag { kind = Token.Start; name; attrs; _ } 1105 when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] -> 1106 if has_element_in_button_scope t "p" then close_p_element t; 1107 ignore (insert_element t name ~push:true attrs) 1108 | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_heading_element name -> 1109 if has_element_in_button_scope t "p" then close_p_element t; 1110 (match current_node t with 1111 | Some n when Parser_constants.is_heading_element n.Dom.name -> 1112 parse_error t "unexpected-start-tag"; 1113 pop_current t 1114 | _ -> ()); 1115 ignore (insert_element t name ~push:true attrs) 1116 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["pre"; "listing"] -> 1117 if has_element_in_button_scope t "p" then close_p_element t; 1118 ignore (insert_element t name ~push:true attrs); 1119 t.ignore_lf <- true; 1120 t.frameset_ok <- false 1121 | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } -> 1122 if t.form_element <> None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then 1123 parse_error t "unexpected-start-tag" 1124 else begin 1125 if has_element_in_button_scope t "p" then close_p_element t; 1126 let form = insert_element t "form" attrs in 1127 t.open_elements <- form :: t.open_elements; 1128 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then 1129 t.form_element <- Some form 1130 end 1131 | Token.Tag { kind = Token.Start; name = "li"; attrs; _ } -> 1132 t.frameset_ok <- false; 1133 let rec check = function 1134 | [] -> () 1135 | n :: rest -> 1136 if n.Dom.name = "li" then begin 1137 generate_implied_end_tags t ~except:"li" (); 1138 (match current_node t with 1139 | Some cn when cn.Dom.name <> "li" -> parse_error t "unexpected-start-tag" 1140 | _ -> ()); 1141 pop_until_tag t "li" 1142 end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then 1143 () 1144 else 1145 check rest 1146 in 1147 check t.open_elements; 1148 if has_element_in_button_scope t "p" then close_p_element t; 1149 ignore (insert_element t "li" ~push:true attrs) 1150 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["dd"; "dt"] -> 1151 t.frameset_ok <- false; 1152 let rec check = function 1153 | [] -> () 1154 | n :: rest -> 1155 if List.mem n.Dom.name ["dd"; "dt"] then begin 1156 generate_implied_end_tags t ~except:n.Dom.name (); 1157 (match current_node t with 1158 | Some cn when cn.Dom.name <> n.Dom.name -> parse_error t "unexpected-start-tag" 1159 | _ -> ()); 1160 pop_until_one_of t ["dd"; "dt"] 1161 end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then 1162 () 1163 else 1164 check rest 1165 in 1166 check t.open_elements; 1167 if has_element_in_button_scope t "p" then close_p_element t; 1168 ignore (insert_element t name ~push:true attrs) 1169 | Token.Tag { kind = Token.Start; name = "plaintext"; _ } -> 1170 if has_element_in_button_scope t "p" then close_p_element t; 1171 ignore (insert_element t "plaintext" ~push:true []) 1172 (* Tokenizer should switch to PLAINTEXT state *) 1173 | Token.Tag { kind = Token.Start; name = "button"; attrs; _ } -> 1174 if has_element_in_scope t "button" then begin 1175 parse_error t "unexpected-start-tag"; 1176 generate_implied_end_tags t (); 1177 pop_until_tag t "button" 1178 end; 1179 reconstruct_active_formatting t; 1180 ignore (insert_element t "button" ~push:true attrs); 1181 t.frameset_ok <- false 1182 | Token.Tag { kind = Token.End; name; _ } 1183 when List.mem name ["address"; "article"; "aside"; "blockquote"; "button"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "listing"; "main"; "menu"; "nav"; "ol"; "pre"; "search"; "section"; "summary"; "ul"] -> 1184 if not (has_element_in_scope t name) then 1185 parse_error t ("unexpected-end-tag:" ^ name) 1186 else begin 1187 generate_implied_end_tags t (); 1188 (match current_node t with 1189 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1190 | _ -> ()); 1191 pop_until_tag t name 1192 end 1193 | Token.Tag { kind = Token.End; name = "form"; _ } -> 1194 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin 1195 let node = t.form_element in 1196 t.form_element <- None; 1197 match node with 1198 | None -> parse_error t "unexpected-end-tag" 1199 | Some form_node -> 1200 if not (has_element_in_scope t "form") then 1201 parse_error t "unexpected-end-tag" 1202 else begin 1203 generate_implied_end_tags t (); 1204 (match current_node t with 1205 | Some n when n != form_node -> parse_error t "end-tag-too-early" 1206 | _ -> ()); 1207 t.open_elements <- List.filter (fun n -> n != form_node) t.open_elements 1208 end 1209 end else begin 1210 if not (has_element_in_scope t "form") then 1211 parse_error t "unexpected-end-tag" 1212 else begin 1213 generate_implied_end_tags t (); 1214 (match current_node t with 1215 | Some n when n.Dom.name <> "form" -> parse_error t "end-tag-too-early" 1216 | _ -> ()); 1217 pop_until_tag t "form" 1218 end 1219 end 1220 | Token.Tag { kind = Token.End; name = "p"; _ } -> 1221 if not (has_element_in_button_scope t "p") then begin 1222 parse_error t "no-p-element-in-scope"; 1223 ignore (insert_element t "p" ~push:true []) 1224 end; 1225 close_p_element t 1226 | Token.Tag { kind = Token.End; name = "li"; _ } -> 1227 if not (has_element_in_list_item_scope t "li") then 1228 parse_error t "unexpected-end-tag" 1229 else begin 1230 generate_implied_end_tags t ~except:"li" (); 1231 (match current_node t with 1232 | Some n when n.Dom.name <> "li" -> parse_error t "end-tag-too-early" 1233 | _ -> ()); 1234 pop_until_tag t "li" 1235 end 1236 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["dd"; "dt"] -> 1237 if not (has_element_in_scope t name) then 1238 parse_error t "unexpected-end-tag" 1239 else begin 1240 generate_implied_end_tags t ~except:name (); 1241 (match current_node t with 1242 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1243 | _ -> ()); 1244 pop_until_tag t name 1245 end 1246 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_heading_element name -> 1247 if not (has_element_in_scope_impl t Parser_constants.heading_elements Parser_constants.default_scope ~check_integration_points:true) then 1248 parse_error t "unexpected-end-tag" 1249 else begin 1250 generate_implied_end_tags t (); 1251 (match current_node t with 1252 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1253 | _ -> ()); 1254 pop_until_one_of t Parser_constants.heading_elements 1255 end 1256 | Token.Tag { kind = Token.Start; name = "a"; attrs; _ } -> 1257 (* Check for existing <a> in active formatting *) 1258 let rec find_a = function 1259 | [] -> None 1260 | Marker :: _ -> None 1261 | Entry e :: _ when e.name = "a" -> Some e.node 1262 | _ :: rest -> find_a rest 1263 in 1264 (match find_a t.active_formatting with 1265 | Some existing -> 1266 parse_error t "unexpected-start-tag"; 1267 adoption_agency t "a"; 1268 t.active_formatting <- List.filter (function 1269 | Entry e -> e.node != existing 1270 | _ -> true 1271 ) t.active_formatting; 1272 t.open_elements <- List.filter (fun n -> n != existing) t.open_elements 1273 | None -> ()); 1274 reconstruct_active_formatting t; 1275 let node = insert_element t "a" attrs in 1276 t.open_elements <- node :: t.open_elements; 1277 push_formatting_element t node "a" attrs 1278 | Token.Tag { kind = Token.Start; name; attrs; _ } 1279 when List.mem name ["b"; "big"; "code"; "em"; "font"; "i"; "s"; "small"; "strike"; "strong"; "tt"; "u"] -> 1280 reconstruct_active_formatting t; 1281 let node = insert_element t name attrs in 1282 t.open_elements <- node :: t.open_elements; 1283 push_formatting_element t node name attrs 1284 | Token.Tag { kind = Token.Start; name = "nobr"; attrs; _ } -> 1285 if has_element_in_scope t "nobr" then begin 1286 parse_error t "unexpected-start-tag"; 1287 adoption_agency t "nobr"; 1288 (* Remove nobr from active formatting *) 1289 t.active_formatting <- List.filter (function 1290 | Entry e -> e.name <> "nobr" 1291 | Marker -> true 1292 ) t.active_formatting; 1293 (* Remove nobr from open elements *) 1294 t.open_elements <- List.filter (fun n -> n.Dom.name <> "nobr") t.open_elements 1295 end; 1296 reconstruct_active_formatting t; 1297 let node = insert_element t "nobr" attrs in 1298 t.open_elements <- node :: t.open_elements; 1299 push_formatting_element t node "nobr" attrs 1300 | Token.Tag { kind = Token.End; name; _ } 1301 when List.mem name ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] -> 1302 adoption_agency t name 1303 | Token.Tag { kind = Token.Start; name; attrs; _ } 1304 when List.mem name ["applet"; "marquee"; "object"] -> 1305 reconstruct_active_formatting t; 1306 ignore (insert_element t name ~push:true attrs); 1307 push_formatting_marker t; 1308 t.frameset_ok <- false 1309 | Token.Tag { kind = Token.End; name; _ } 1310 when List.mem name ["applet"; "marquee"; "object"] -> 1311 if not (has_element_in_scope t name) then 1312 parse_error t "unexpected-end-tag" 1313 else begin 1314 generate_implied_end_tags t (); 1315 (match current_node t with 1316 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1317 | _ -> ()); 1318 pop_until_tag t name; 1319 clear_active_formatting_to_marker t 1320 end 1321 | Token.Tag { kind = Token.Start; name = "table"; attrs; _ } -> 1322 if t.quirks_mode <> Dom.Quirks && has_element_in_button_scope t "p" then 1323 close_p_element t; 1324 ignore (insert_element t "table" ~push:true attrs); 1325 t.frameset_ok <- false; 1326 t.mode <- Parser_insertion_mode.In_table 1327 | Token.Tag { kind = Token.End; name = "br"; _ } -> 1328 parse_error t "end-tag-br"; 1329 reconstruct_active_formatting t; 1330 ignore (insert_element t "br" ~push:true []); 1331 pop_current t; 1332 t.frameset_ok <- false 1333 | Token.Tag { kind = Token.Start; name; attrs; _ } 1334 when List.mem name ["area"; "br"; "embed"; "img"; "keygen"; "wbr"] -> 1335 reconstruct_active_formatting t; 1336 ignore (insert_element t name ~push:true attrs); 1337 pop_current t; 1338 t.frameset_ok <- false 1339 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } -> 1340 reconstruct_active_formatting t; 1341 ignore (insert_element t "input" ~push:true attrs); 1342 pop_current t; 1343 let is_hidden = List.exists (fun (k, v) -> 1344 String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden" 1345 ) attrs in 1346 if not is_hidden then t.frameset_ok <- false 1347 | Token.Tag { kind = Token.Start; name; attrs; _ } 1348 when List.mem name ["param"; "source"; "track"] -> 1349 ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing = false }); 1350 pop_current t 1351 | Token.Tag { kind = Token.Start; name = "hr"; _ } -> 1352 if has_element_in_button_scope t "p" then close_p_element t; 1353 ignore (insert_element t "hr" ~push:true []); 1354 pop_current t; 1355 t.frameset_ok <- false 1356 | Token.Tag { kind = Token.Start; name = "image"; attrs; _ } -> 1357 parse_error t "unexpected-start-tag"; 1358 (* Treat <image> as <img> *) 1359 reconstruct_active_formatting t; 1360 ignore (insert_element t "img" ~push:true attrs); 1361 pop_current t; 1362 t.frameset_ok <- false 1363 | Token.Tag { kind = Token.Start; name = "textarea"; attrs; _ } -> 1364 ignore (insert_element t "textarea" ~push:true attrs); 1365 t.ignore_lf <- true; 1366 t.original_mode <- Some t.mode; 1367 t.frameset_ok <- false; 1368 t.mode <- Parser_insertion_mode.Text 1369 | Token.Tag { kind = Token.Start; name = "xmp"; attrs; _ } -> 1370 if has_element_in_button_scope t "p" then close_p_element t; 1371 reconstruct_active_formatting t; 1372 t.frameset_ok <- false; 1373 ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs; self_closing = false }); 1374 t.original_mode <- Some t.mode; 1375 t.mode <- Parser_insertion_mode.Text 1376 | Token.Tag { kind = Token.Start; name = "iframe"; attrs; _ } -> 1377 t.frameset_ok <- false; 1378 ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs; self_closing = false }); 1379 t.original_mode <- Some t.mode; 1380 t.mode <- Parser_insertion_mode.Text 1381 | Token.Tag { kind = Token.Start; name = "noembed"; attrs; _ } -> 1382 ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs; self_closing = false }); 1383 t.original_mode <- Some t.mode; 1384 t.mode <- Parser_insertion_mode.Text 1385 | Token.Tag { kind = Token.Start; name = "select"; attrs; _ } -> 1386 reconstruct_active_formatting t; 1387 ignore (insert_element t "select" ~push:true attrs); 1388 t.frameset_ok <- false; 1389 if List.mem t.mode [Parser_insertion_mode.In_table; Parser_insertion_mode.In_caption; Parser_insertion_mode.In_table_body; Parser_insertion_mode.In_row; Parser_insertion_mode.In_cell] then 1390 t.mode <- Parser_insertion_mode.In_select_in_table 1391 else 1392 t.mode <- Parser_insertion_mode.In_select 1393 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["optgroup"; "option"] -> 1394 (match current_node t with 1395 | Some n when n.Dom.name = "option" -> pop_current t 1396 | _ -> ()); 1397 reconstruct_active_formatting t; 1398 ignore (insert_element t name ~push:true attrs) 1399 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rb"; "rtc"] -> 1400 if has_element_in_scope t "ruby" then begin 1401 generate_implied_end_tags t () 1402 end; 1403 (match current_node t with 1404 | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag" 1405 | _ -> ()); 1406 ignore (insert_element t name ~push:true attrs) 1407 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rp"; "rt"] -> 1408 if has_element_in_scope t "ruby" then begin 1409 generate_implied_end_tags t ~except:"rtc" () 1410 end; 1411 (match current_node t with 1412 | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag" 1413 | _ -> ()); 1414 ignore (insert_element t name ~push:true attrs) 1415 | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } -> 1416 reconstruct_active_formatting t; 1417 let adjusted_attrs = Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) in 1418 ignore (insert_foreign_element t { kind = Token.Start; name = "math"; attrs = adjusted_attrs; self_closing } (Some "mathml")); 1419 if self_closing then pop_current t 1420 | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } -> 1421 reconstruct_active_formatting t; 1422 let adjusted_attrs = Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) in 1423 ignore (insert_foreign_element t { kind = Token.Start; name = "svg"; attrs = adjusted_attrs; self_closing } (Some "svg")); 1424 if self_closing then pop_current t 1425 | Token.Tag { kind = Token.Start; name; attrs; _ } 1426 when List.mem name ["col"; "frame"] -> 1427 (* In fragment context, insert these; otherwise ignore *) 1428 if t.fragment_context = None then 1429 parse_error t "unexpected-start-tag-ignored" 1430 else 1431 ignore (insert_element t name attrs) 1432 | Token.Tag { kind = Token.Start; name; _ } 1433 when List.mem name ["caption"; "colgroup"; "head"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1434 parse_error t "unexpected-start-tag" 1435 | Token.Tag { kind = Token.Start; name; attrs; self_closing } -> 1436 (* Any other start tag *) 1437 reconstruct_active_formatting t; 1438 ignore (insert_element t name ~push:true attrs); 1439 (* Check for self-closing on non-void HTML element *) 1440 if self_closing && not (Parser_constants.is_void_element name) then 1441 parse_error t "non-void-html-element-start-tag-with-trailing-solidus" 1442 | Token.Tag { kind = Token.End; name; _ } -> 1443 (* Any other end tag *) 1444 let rec check = function 1445 | [] -> () 1446 | node :: rest -> 1447 if node.Dom.name = name then begin 1448 generate_implied_end_tags t ~except:name (); 1449 (match current_node t with 1450 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1451 | _ -> ()); 1452 pop_until t (fun n -> n == node) 1453 end else if is_special_element node then 1454 parse_error t ("unexpected-end-tag:" ^ name) 1455 else 1456 check rest 1457 in 1458 check t.open_elements 1459 1460and process_text t token = 1461 match token with 1462 | Token.Character data -> 1463 insert_character t data 1464 | Token.EOF -> 1465 parse_error t "expected-closing-tag-but-got-eof"; 1466 pop_current t; 1467 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body; 1468 process_token t token 1469 | Token.Tag { kind = Token.End; _ } -> 1470 pop_current t; 1471 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body 1472 | _ -> () 1473 1474and process_in_table t token = 1475 match token with 1476 | Token.Character _ when (match current_node t with Some n -> Parser_constants.is_foster_parenting_element n.Dom.name | None -> false) -> 1477 t.pending_table_chars <- []; 1478 t.original_mode <- Some t.mode; 1479 t.mode <- Parser_insertion_mode.In_table_text; 1480 process_token t token 1481 | Token.Comment data -> 1482 insert_comment t data 1483 | Token.Doctype _ -> 1484 parse_error t "unexpected-doctype" 1485 | Token.Tag { kind = Token.Start; name = "caption"; attrs; _ } -> 1486 clear_stack_back_to_table_context t; 1487 push_formatting_marker t; 1488 ignore (insert_element t "caption" ~push:true attrs); 1489 t.mode <- Parser_insertion_mode.In_caption 1490 | Token.Tag { kind = Token.Start; name = "colgroup"; attrs; _ } -> 1491 clear_stack_back_to_table_context t; 1492 ignore (insert_element t "colgroup" ~push:true attrs); 1493 t.mode <- Parser_insertion_mode.In_column_group 1494 | Token.Tag { kind = Token.Start; name = "col"; _ } -> 1495 clear_stack_back_to_table_context t; 1496 ignore (insert_element t "colgroup" ~push:true []); 1497 t.mode <- Parser_insertion_mode.In_column_group; 1498 process_token t token 1499 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> 1500 clear_stack_back_to_table_context t; 1501 ignore (insert_element t name ~push:true attrs); 1502 t.mode <- Parser_insertion_mode.In_table_body 1503 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"; "tr"] -> 1504 clear_stack_back_to_table_context t; 1505 ignore (insert_element t "tbody" ~push:true []); 1506 t.mode <- Parser_insertion_mode.In_table_body; 1507 process_token t token 1508 | Token.Tag { kind = Token.Start; name = "table"; _ } -> 1509 parse_error t "unexpected-start-tag"; 1510 if has_element_in_table_scope t "table" then begin 1511 pop_until_tag t "table"; 1512 reset_insertion_mode t; 1513 process_token t token 1514 end 1515 | Token.Tag { kind = Token.End; name = "table"; _ } -> 1516 if not (has_element_in_table_scope t "table") then 1517 parse_error t "unexpected-end-tag" 1518 else begin 1519 pop_until_tag t "table"; 1520 reset_insertion_mode t 1521 end 1522 | Token.Tag { kind = Token.End; name; _ } 1523 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1524 parse_error t "unexpected-end-tag" 1525 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["style"; "script"; "template"] -> 1526 process_in_head t token 1527 | Token.Tag { kind = Token.End; name = "template"; _ } -> 1528 process_in_head t token 1529 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } -> 1530 let is_hidden = List.exists (fun (k, v) -> 1531 String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden" 1532 ) attrs in 1533 if not is_hidden then begin 1534 parse_error t "start-tag-in-table:input"; 1535 t.foster_parenting <- true; 1536 process_in_body t token; 1537 t.foster_parenting <- false 1538 end else begin 1539 parse_error t "start-tag-in-table:input"; 1540 ignore (insert_element t "input" ~push:true attrs); 1541 pop_current t 1542 end 1543 | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } -> 1544 parse_error t "unexpected-start-tag"; 1545 if t.form_element = None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin 1546 let form = insert_element t "form" attrs in 1547 t.open_elements <- form :: t.open_elements; 1548 t.form_element <- Some form; 1549 pop_current t 1550 end 1551 | Token.EOF -> 1552 process_in_body t token 1553 | _ -> 1554 parse_error t "unexpected-token-in-table"; 1555 t.foster_parenting <- true; 1556 process_in_body t token; 1557 t.foster_parenting <- false 1558 1559and clear_stack_back_to_table_context t = 1560 let rec loop () = 1561 match current_node t with 1562 | Some n when not (List.mem n.Dom.name ["table"; "template"; "html"]) -> 1563 pop_current t; 1564 loop () 1565 | _ -> () 1566 in 1567 loop () 1568 1569and process_in_table_text t token = 1570 match token with 1571 | Token.Character data -> 1572 if String.contains data '\x00' then 1573 parse_error t "unexpected-null-character" 1574 else 1575 t.pending_table_chars <- data :: t.pending_table_chars 1576 | _ -> 1577 let pending = String.concat "" (List.rev t.pending_table_chars) in 1578 t.pending_table_chars <- []; 1579 if not (is_whitespace pending) then begin 1580 parse_error t "unexpected-character-in-table"; 1581 t.foster_parenting <- true; 1582 reconstruct_active_formatting t; 1583 insert_character t pending; 1584 t.foster_parenting <- false 1585 end else 1586 insert_character t pending; 1587 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_table; 1588 process_token t token 1589 1590and process_in_caption t token = 1591 match token with 1592 | Token.Tag { kind = Token.End; name = "caption"; _ } -> 1593 if not (has_element_in_table_scope t "caption") then 1594 parse_error t "unexpected-end-tag" 1595 else begin 1596 generate_implied_end_tags t (); 1597 (match current_node t with 1598 | Some n when n.Dom.name <> "caption" -> parse_error t "end-tag-too-early" 1599 | _ -> ()); 1600 pop_until_tag t "caption"; 1601 clear_active_formatting_to_marker t; 1602 t.mode <- Parser_insertion_mode.In_table 1603 end 1604 | Token.Tag { kind = Token.Start; name; _ } 1605 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1606 if not (has_element_in_table_scope t "caption") then 1607 parse_error t "unexpected-start-tag" 1608 else begin 1609 generate_implied_end_tags t (); 1610 pop_until_tag t "caption"; 1611 clear_active_formatting_to_marker t; 1612 t.mode <- Parser_insertion_mode.In_table; 1613 process_token t token 1614 end 1615 | Token.Tag { kind = Token.End; name = "table"; _ } -> 1616 if not (has_element_in_table_scope t "caption") then 1617 parse_error t "unexpected-end-tag" 1618 else begin 1619 generate_implied_end_tags t (); 1620 pop_until_tag t "caption"; 1621 clear_active_formatting_to_marker t; 1622 t.mode <- Parser_insertion_mode.In_table; 1623 process_token t token 1624 end 1625 | Token.Tag { kind = Token.End; name; _ } 1626 when List.mem name ["body"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1627 parse_error t "unexpected-end-tag" 1628 | _ -> 1629 process_in_body t token 1630 1631and process_in_column_group t token = 1632 match token with 1633 | Token.Character data when is_whitespace data -> 1634 insert_character t data 1635 | Token.Character data -> 1636 (* Split leading whitespace from non-whitespace *) 1637 let ws_chars = [' '; '\t'; '\n'; '\x0C'; '\r'] in 1638 let len = String.length data in 1639 let ws_end = ref 0 in 1640 while !ws_end < len && List.mem data.[!ws_end] ws_chars do incr ws_end done; 1641 if !ws_end > 0 then 1642 insert_character t (String.sub data 0 !ws_end); 1643 if !ws_end < len then begin 1644 let remaining = String.sub data !ws_end (len - !ws_end) in 1645 (match current_node t with 1646 | Some n when n.Dom.name = "colgroup" -> 1647 pop_current t; 1648 t.mode <- Parser_insertion_mode.In_table; 1649 process_token t (Token.Character remaining) 1650 | _ -> 1651 parse_error t "unexpected-token") 1652 end 1653 | Token.Comment data -> 1654 insert_comment t data 1655 | Token.Doctype _ -> 1656 parse_error t "unexpected-doctype" 1657 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 1658 process_in_body t token 1659 | Token.Tag { kind = Token.Start; name = "col"; attrs; _ } -> 1660 ignore (insert_element t "col" ~push:true attrs); 1661 pop_current t 1662 | Token.Tag { kind = Token.End; name = "colgroup"; _ } -> 1663 (match current_node t with 1664 | Some n when n.Dom.name <> "colgroup" -> parse_error t "unexpected-end-tag" 1665 | Some _ -> pop_current t; t.mode <- Parser_insertion_mode.In_table 1666 | None -> parse_error t "unexpected-end-tag") 1667 | Token.Tag { kind = Token.End; name = "col"; _ } -> 1668 parse_error t "unexpected-end-tag" 1669 | Token.Tag { kind = Token.Start; name = "template"; _ } 1670 | Token.Tag { kind = Token.End; name = "template"; _ } -> 1671 process_in_head t token 1672 | Token.EOF -> 1673 process_in_body t token 1674 | _ -> 1675 (match current_node t with 1676 | Some n when n.Dom.name = "colgroup" -> 1677 pop_current t; 1678 t.mode <- Parser_insertion_mode.In_table; 1679 process_token t token 1680 | _ -> 1681 parse_error t "unexpected-token") 1682 1683and process_in_table_body t token = 1684 match token with 1685 | Token.Tag { kind = Token.Start; name = "tr"; attrs; _ } -> 1686 clear_stack_back_to_table_body_context t; 1687 ignore (insert_element t "tr" ~push:true attrs); 1688 t.mode <- Parser_insertion_mode.In_row 1689 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["th"; "td"] -> 1690 parse_error t "unexpected-start-tag"; 1691 clear_stack_back_to_table_body_context t; 1692 ignore (insert_element t "tr" ~push:true []); 1693 t.mode <- Parser_insertion_mode.In_row; 1694 process_token t token 1695 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> 1696 if not (has_element_in_table_scope t name) then 1697 parse_error t "unexpected-end-tag" 1698 else begin 1699 clear_stack_back_to_table_body_context t; 1700 pop_current t; 1701 t.mode <- Parser_insertion_mode.In_table 1702 end 1703 | Token.Tag { kind = Token.Start; name; _ } 1704 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"] -> 1705 if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then 1706 parse_error t "unexpected-start-tag" 1707 else begin 1708 clear_stack_back_to_table_body_context t; 1709 pop_current t; 1710 t.mode <- Parser_insertion_mode.In_table; 1711 process_token t token 1712 end 1713 | Token.Tag { kind = Token.End; name = "table"; _ } -> 1714 if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then 1715 parse_error t "unexpected-end-tag" 1716 else begin 1717 clear_stack_back_to_table_body_context t; 1718 pop_current t; 1719 t.mode <- Parser_insertion_mode.In_table; 1720 process_token t token 1721 end 1722 | Token.Tag { kind = Token.End; name; _ } 1723 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"; "tr"] -> 1724 parse_error t "unexpected-end-tag" 1725 | _ -> 1726 process_in_table t token 1727 1728and clear_stack_back_to_table_body_context t = 1729 let rec loop () = 1730 match current_node t with 1731 | Some n when not (List.mem n.Dom.name ["tbody"; "tfoot"; "thead"; "template"; "html"]) -> 1732 pop_current t; 1733 loop () 1734 | _ -> () 1735 in 1736 loop () 1737 1738and process_in_row t token = 1739 match token with 1740 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["th"; "td"] -> 1741 clear_stack_back_to_table_row_context t; 1742 ignore (insert_element t name ~push:true attrs); 1743 t.mode <- Parser_insertion_mode.In_cell; 1744 push_formatting_marker t 1745 | Token.Tag { kind = Token.End; name = "tr"; _ } -> 1746 if not (has_element_in_table_scope t "tr") then 1747 parse_error t "unexpected-end-tag" 1748 else begin 1749 clear_stack_back_to_table_row_context t; 1750 pop_current t; 1751 t.mode <- Parser_insertion_mode.In_table_body 1752 end 1753 | Token.Tag { kind = Token.Start; name; _ } 1754 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"; "tr"] -> 1755 if not (has_element_in_table_scope t "tr") then 1756 parse_error t "unexpected-start-tag" 1757 else begin 1758 clear_stack_back_to_table_row_context t; 1759 pop_current t; 1760 t.mode <- Parser_insertion_mode.In_table_body; 1761 process_token t token 1762 end 1763 | Token.Tag { kind = Token.End; name = "table"; _ } -> 1764 if not (has_element_in_table_scope t "tr") then 1765 parse_error t "unexpected-end-tag" 1766 else begin 1767 clear_stack_back_to_table_row_context t; 1768 pop_current t; 1769 t.mode <- Parser_insertion_mode.In_table_body; 1770 process_token t token 1771 end 1772 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> 1773 if not (has_element_in_table_scope t name) then 1774 parse_error t "unexpected-end-tag" 1775 else if not (has_element_in_table_scope t "tr") then 1776 parse_error t "unexpected-end-tag" 1777 else begin 1778 clear_stack_back_to_table_row_context t; 1779 pop_current t; 1780 t.mode <- Parser_insertion_mode.In_table_body; 1781 process_token t token 1782 end 1783 | Token.Tag { kind = Token.End; name; _ } 1784 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"] -> 1785 parse_error t "unexpected-end-tag" 1786 | _ -> 1787 process_in_table t token 1788 1789and clear_stack_back_to_table_row_context t = 1790 let rec loop () = 1791 match current_node t with 1792 | Some n when not (List.mem n.Dom.name ["tr"; "template"; "html"]) -> 1793 pop_current t; 1794 loop () 1795 | _ -> () 1796 in 1797 loop () 1798 1799and process_in_cell t token = 1800 match token with 1801 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_table_cell_element name -> 1802 if not (has_element_in_table_scope t name) then 1803 parse_error t "unexpected-end-tag" 1804 else begin 1805 generate_implied_end_tags t (); 1806 (match current_node t with 1807 | Some n when not (n.Dom.name = name && is_in_html_namespace n) -> parse_error t "end-tag-too-early" 1808 | _ -> ()); 1809 pop_until_html_tag t name; 1810 clear_active_formatting_to_marker t; 1811 t.mode <- Parser_insertion_mode.In_row 1812 end 1813 | Token.Tag { kind = Token.Start; name; _ } 1814 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1815 if not (has_element_in_scope_impl t ["td"; "th"] Parser_constants.table_scope ~check_integration_points:false) then 1816 parse_error t "unexpected-start-tag" 1817 else begin 1818 close_cell t; 1819 process_token t token 1820 end 1821 | Token.Tag { kind = Token.End; name; _ } 1822 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] -> 1823 parse_error t "unexpected-end-tag" 1824 | Token.Tag { kind = Token.End; name; _ } 1825 when Parser_constants.is_foster_parenting_element name -> 1826 if not (has_element_in_table_scope t name) then 1827 parse_error t "unexpected-end-tag" 1828 else begin 1829 close_cell t; 1830 process_token t token 1831 end 1832 | _ -> 1833 process_in_body t token 1834 1835and close_cell t = 1836 generate_implied_end_tags t (); 1837 (match current_node t with 1838 | Some n when not (Parser_constants.is_table_cell_element n.Dom.name && is_in_html_namespace n) -> parse_error t "end-tag-too-early" 1839 | _ -> ()); 1840 pop_until_html_one_of t ["td"; "th"]; 1841 clear_active_formatting_to_marker t; 1842 t.mode <- Parser_insertion_mode.In_row 1843 1844and process_in_select t token = 1845 match token with 1846 | Token.Character "\x00" -> 1847 parse_error t "unexpected-null-character" 1848 | Token.Character data -> 1849 reconstruct_active_formatting t; 1850 insert_character t data 1851 | Token.Comment data -> 1852 insert_comment t data 1853 | Token.Doctype _ -> 1854 parse_error t "unexpected-doctype" 1855 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 1856 process_in_body t token 1857 | Token.Tag { kind = Token.Start; name = "option"; attrs; _ } -> 1858 (match current_node t with 1859 | Some n when n.Dom.name = "option" -> pop_current t 1860 | _ -> ()); 1861 reconstruct_active_formatting t; 1862 ignore (insert_element t "option" ~push:true attrs) 1863 | Token.Tag { kind = Token.Start; name = "optgroup"; attrs; _ } -> 1864 (match current_node t with 1865 | Some n when n.Dom.name = "option" -> pop_current t 1866 | _ -> ()); 1867 (match current_node t with 1868 | Some n when n.Dom.name = "optgroup" -> pop_current t 1869 | _ -> ()); 1870 reconstruct_active_formatting t; 1871 ignore (insert_element t "optgroup" ~push:true attrs) 1872 | Token.Tag { kind = Token.Start; name = "hr"; _ } -> 1873 (match current_node t with 1874 | Some n when n.Dom.name = "option" -> pop_current t 1875 | _ -> ()); 1876 (match current_node t with 1877 | Some n when n.Dom.name = "optgroup" -> pop_current t 1878 | _ -> ()); 1879 ignore (insert_element t "hr" ~push:true []); 1880 pop_current t 1881 | Token.Tag { kind = Token.End; name = "optgroup"; _ } -> 1882 (match t.open_elements with 1883 | opt :: optg :: _ when opt.Dom.name = "option" && optg.Dom.name = "optgroup" -> 1884 pop_current t 1885 | _ -> ()); 1886 (match current_node t with 1887 | Some n when n.Dom.name = "optgroup" -> pop_current t 1888 | _ -> parse_error t "unexpected-end-tag") 1889 | Token.Tag { kind = Token.End; name = "option"; _ } -> 1890 (match current_node t with 1891 | Some n when n.Dom.name = "option" -> pop_current t 1892 | _ -> parse_error t "unexpected-end-tag") 1893 | Token.Tag { kind = Token.End; name = "select"; _ } -> 1894 if not (has_element_in_select_scope t "select") then 1895 parse_error t "unexpected-end-tag" 1896 else begin 1897 pop_until_tag t "select"; 1898 reset_insertion_mode t 1899 end 1900 | Token.Tag { kind = Token.Start; name = "select"; _ } -> 1901 parse_error t "unexpected-start-tag"; 1902 (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *) 1903 pop_until_tag t "select"; 1904 reset_insertion_mode t 1905 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["input"; "textarea"] -> 1906 parse_error t "unexpected-start-tag"; 1907 (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *) 1908 pop_until_tag t "select"; 1909 reset_insertion_mode t; 1910 process_token t token 1911 | Token.Tag { kind = Token.Start; name = "plaintext"; attrs; _ } -> 1912 (* plaintext is allowed in select - creates element, parser will switch tokenizer to PLAINTEXT mode *) 1913 reconstruct_active_formatting t; 1914 ignore (insert_element t "plaintext" ~push:true attrs) 1915 | Token.Tag { kind = Token.Start; name = "menuitem"; attrs; _ } -> 1916 (* menuitem is allowed in select *) 1917 reconstruct_active_formatting t; 1918 ignore (insert_element t "menuitem" ~push:true attrs) 1919 | Token.Tag { kind = Token.Start; name = "keygen"; attrs; _ } -> 1920 (* keygen is handled specially in select - inserted directly *) 1921 reconstruct_active_formatting t; 1922 ignore (insert_element t "keygen" attrs) 1923 (* Void element, don't push to stack *) 1924 | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } -> 1925 reconstruct_active_formatting t; 1926 let node = insert_foreign_element t { kind = Token.Start; name = "svg"; attrs; self_closing } (Some "svg") in 1927 if not self_closing then t.open_elements <- node :: t.open_elements 1928 | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } -> 1929 reconstruct_active_formatting t; 1930 let node = insert_foreign_element t { kind = Token.Start; name = "math"; attrs; self_closing } (Some "mathml") in 1931 if not self_closing then t.open_elements <- node :: t.open_elements 1932 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["script"; "template"] -> 1933 process_in_head t token 1934 | Token.Tag { kind = Token.End; name = "template"; _ } -> 1935 process_in_head t token 1936 (* Allow certain HTML elements in select - newer spec behavior *) 1937 | Token.Tag { kind = Token.Start; name; attrs; self_closing } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] -> 1938 reconstruct_active_formatting t; 1939 let node = insert_element t name attrs in 1940 if not self_closing then t.open_elements <- node :: t.open_elements 1941 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["br"; "img"] -> 1942 reconstruct_active_formatting t; 1943 ignore (insert_element t name attrs) 1944 (* Don't push to stack - void elements *) 1945 (* Handle formatting elements in select *) 1946 | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_formatting_element name -> 1947 reconstruct_active_formatting t; 1948 let node = insert_element t name ~push:true attrs in 1949 push_formatting_element t node name attrs 1950 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_formatting_element name -> 1951 (* Find select element and check if formatting element is inside select *) 1952 let select_idx = ref None in 1953 let fmt_idx = ref None in 1954 List.iteri (fun i n -> 1955 if n.Dom.name = "select" && !select_idx = None then select_idx := Some i; 1956 if n.Dom.name = name then fmt_idx := Some i 1957 ) t.open_elements; 1958 (match !fmt_idx, !select_idx with 1959 | Some fi, Some si when fi < si -> 1960 (* Formatting element is inside select, run adoption agency *) 1961 adoption_agency t name 1962 | Some _, Some _ -> 1963 (* Formatting element is outside select boundary - parse error, ignore *) 1964 parse_error t "unexpected-end-tag" 1965 | Some _, None -> 1966 adoption_agency t name 1967 | None, _ -> 1968 parse_error t "unexpected-end-tag") 1969 (* End tags for HTML elements allowed in select *) 1970 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] -> 1971 (* Find select and target indices *) 1972 let select_idx = ref None in 1973 let target_idx = ref None in 1974 List.iteri (fun i n -> 1975 if n.Dom.name = "select" && !select_idx = None then select_idx := Some i; 1976 if n.Dom.name = name then target_idx := Some i 1977 ) t.open_elements; 1978 (* Only pop if target exists and is inside select (lower index = closer to current) *) 1979 (match !target_idx, !select_idx with 1980 | Some ti, Some si when ti < si -> 1981 (* Pop until we reach the target *) 1982 let rec pop_to_target () = 1983 match t.open_elements with 1984 | [] -> () 1985 | n :: rest -> 1986 t.open_elements <- rest; 1987 if n.Dom.name <> name then pop_to_target () 1988 in 1989 pop_to_target () 1990 | Some _, Some _ -> 1991 parse_error t "unexpected-end-tag" 1992 | Some _, None -> 1993 (* No select on stack, just pop to target *) 1994 let rec pop_to_target () = 1995 match t.open_elements with 1996 | [] -> () 1997 | n :: rest -> 1998 t.open_elements <- rest; 1999 if n.Dom.name <> name then pop_to_target () 2000 in 2001 pop_to_target () 2002 | None, _ -> 2003 parse_error t "unexpected-end-tag") 2004 | Token.EOF -> 2005 process_in_body t token 2006 | _ -> 2007 parse_error t "unexpected-token-in-select" 2008 2009and process_in_select_in_table t token = 2010 match token with 2011 | Token.Tag { kind = Token.Start; name; _ } 2012 when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] -> 2013 parse_error t "unexpected-start-tag"; 2014 pop_until_tag t "select"; 2015 reset_insertion_mode t; 2016 process_token t token 2017 | Token.Tag { kind = Token.End; name; _ } 2018 when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] -> 2019 parse_error t "unexpected-end-tag"; 2020 if has_element_in_table_scope t name then begin 2021 pop_until_tag t "select"; 2022 reset_insertion_mode t; 2023 process_token t token 2024 end 2025 | _ -> 2026 process_in_select t token 2027 2028and process_in_template t token = 2029 match token with 2030 | Token.Character _ | Token.Comment _ | Token.Doctype _ -> 2031 process_in_body t token 2032 | Token.Tag { kind = Token.Start; name; _ } 2033 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] -> 2034 process_in_head t token 2035 | Token.Tag { kind = Token.End; name = "template"; _ } -> 2036 process_in_head t token 2037 | Token.Tag { kind = Token.Start; name; _ } 2038 when List.mem name ["caption"; "colgroup"; "tbody"; "tfoot"; "thead"] -> 2039 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2040 t.template_modes <- Parser_insertion_mode.In_table :: t.template_modes; 2041 t.mode <- Parser_insertion_mode.In_table; 2042 process_token t token 2043 | Token.Tag { kind = Token.Start; name = "col"; _ } -> 2044 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2045 t.template_modes <- Parser_insertion_mode.In_column_group :: t.template_modes; 2046 t.mode <- Parser_insertion_mode.In_column_group; 2047 process_token t token 2048 | Token.Tag { kind = Token.Start; name = "tr"; _ } -> 2049 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2050 t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes; 2051 t.mode <- Parser_insertion_mode.In_table_body; 2052 process_token t token 2053 | Token.Tag { kind = Token.Start; name; _ } when Parser_constants.is_table_cell_element name -> 2054 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2055 t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes; 2056 t.mode <- Parser_insertion_mode.In_row; 2057 process_token t token 2058 | Token.Tag { kind = Token.Start; _ } -> 2059 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2060 t.template_modes <- Parser_insertion_mode.In_body :: t.template_modes; 2061 t.mode <- Parser_insertion_mode.In_body; 2062 process_token t token 2063 | Token.Tag { kind = Token.End; name; _ } -> 2064 parse_error t ("unexpected-end-tag:" ^ name) 2065 | Token.EOF -> 2066 if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then 2067 () (* Stop parsing *) 2068 else begin 2069 parse_error t "expected-closing-tag-but-got-eof"; 2070 pop_until_html_tag t "template"; 2071 clear_active_formatting_to_marker t; 2072 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2073 reset_insertion_mode t; 2074 process_token t token 2075 end 2076 2077and process_after_body t token = 2078 match token with 2079 | Token.Character data when is_whitespace data -> 2080 process_in_body t token 2081 | Token.Comment data -> 2082 (* Insert as last child of html element - html is at bottom of stack *) 2083 let html_opt = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in 2084 (match html_opt with 2085 | Some html -> 2086 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in 2087 Dom.append_child html (Dom.create_comment ~location data) 2088 | None -> ()) 2089 | Token.Doctype _ -> 2090 parse_error t "unexpected-doctype" 2091 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2092 process_in_body t token 2093 | Token.Tag { kind = Token.End; name = "html"; _ } -> 2094 if t.fragment_context <> None then 2095 parse_error t "unexpected-end-tag" 2096 else 2097 t.mode <- Parser_insertion_mode.After_after_body 2098 | Token.EOF -> 2099 () (* Stop parsing *) 2100 | _ -> 2101 parse_error t "unexpected-token-after-body"; 2102 t.mode <- Parser_insertion_mode.In_body; 2103 process_token t token 2104 2105and process_in_frameset t token = 2106 match token with 2107 | Token.Character data -> 2108 (* Extract only whitespace characters and insert them *) 2109 let whitespace = String.to_seq data 2110 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) 2111 |> String.of_seq in 2112 if whitespace <> "" then insert_character t whitespace; 2113 if not (is_whitespace data) then 2114 parse_error t "unexpected-char-in-frameset" 2115 | Token.Comment data -> 2116 insert_comment t data 2117 | Token.Doctype _ -> 2118 parse_error t "unexpected-doctype" 2119 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2120 process_in_body t token 2121 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> 2122 ignore (insert_element t "frameset" ~push:true attrs) 2123 | Token.Tag { kind = Token.End; name = "frameset"; _ } -> 2124 (match current_node t with 2125 | Some n when n.Dom.name = "html" -> parse_error t "unexpected-end-tag" 2126 | _ -> 2127 pop_current t; 2128 if t.fragment_context = None then 2129 (match current_node t with 2130 | Some n when n.Dom.name <> "frameset" -> t.mode <- Parser_insertion_mode.After_frameset 2131 | _ -> ())) 2132 | Token.Tag { kind = Token.Start; name = "frame"; attrs; _ } -> 2133 ignore (insert_element t "frame" ~push:true attrs); 2134 pop_current t 2135 | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> 2136 process_in_head t token 2137 | Token.EOF -> 2138 (match current_node t with 2139 | Some n when n.Dom.name <> "html" -> parse_error t "expected-closing-tag-but-got-eof" 2140 | _ -> ()) 2141 | _ -> 2142 parse_error t "unexpected-token-in-frameset" 2143 2144and process_after_frameset t token = 2145 match token with 2146 | Token.Character data -> 2147 (* Extract only whitespace characters and insert them *) 2148 let whitespace = String.to_seq data 2149 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) 2150 |> String.of_seq in 2151 if whitespace <> "" then insert_character t whitespace; 2152 if not (is_whitespace data) then 2153 parse_error t "unexpected-char-after-frameset" 2154 | Token.Comment data -> 2155 insert_comment t data 2156 | Token.Doctype _ -> 2157 parse_error t "unexpected-doctype" 2158 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2159 process_in_body t token 2160 | Token.Tag { kind = Token.End; name = "html"; _ } -> 2161 t.mode <- Parser_insertion_mode.After_after_frameset 2162 | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> 2163 process_in_head t token 2164 | Token.EOF -> 2165 () (* Stop parsing *) 2166 | _ -> 2167 parse_error t "unexpected-token-after-frameset" 2168 2169and process_after_after_body t token = 2170 match token with 2171 | Token.Comment data -> 2172 insert_comment_to_document t data 2173 | Token.Doctype _ -> 2174 process_in_body t token 2175 | Token.Character data when is_whitespace data -> 2176 process_in_body t token 2177 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2178 process_in_body t token 2179 | Token.EOF -> 2180 () (* Stop parsing *) 2181 | _ -> 2182 parse_error t "unexpected-token-after-after-body"; 2183 t.mode <- Parser_insertion_mode.In_body; 2184 process_token t token 2185 2186and process_after_after_frameset t token = 2187 match token with 2188 | Token.Comment data -> 2189 insert_comment_to_document t data 2190 | Token.Doctype _ -> 2191 process_in_body t token 2192 | Token.Character data -> 2193 (* Extract only whitespace characters and process using in_body rules *) 2194 let whitespace = String.to_seq data 2195 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) 2196 |> String.of_seq in 2197 if whitespace <> "" then process_in_body t (Token.Character whitespace); 2198 if not (is_whitespace data) then 2199 parse_error t "unexpected-char-after-after-frameset" 2200 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2201 process_in_body t token 2202 | Token.EOF -> 2203 () (* Stop parsing *) 2204 | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> 2205 process_in_head t token 2206 | _ -> 2207 parse_error t "unexpected-token-after-after-frameset" 2208 2209and process_token t token = 2210 (* Check for HTML integration points (SVG foreignObject, desc, title) *) 2211 let is_html_integration_point node = 2212 (* SVG foreignObject, desc, and title are always HTML integration points *) 2213 if node.Dom.namespace = Some "svg" && 2214 Parser_constants.is_svg_html_integration node.Dom.name then true 2215 (* annotation-xml is an HTML integration point only with specific encoding values *) 2216 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then 2217 match List.assoc_opt "encoding" node.Dom.attrs with 2218 | Some enc -> 2219 let enc_lower = String.lowercase_ascii enc in 2220 enc_lower = "text/html" || enc_lower = "application/xhtml+xml" 2221 | None -> false 2222 else false 2223 in 2224 (* Check for MathML text integration points *) 2225 let is_mathml_text_integration_point node = 2226 node.Dom.namespace = Some "mathml" && 2227 Parser_constants.is_mathml_text_integration node.Dom.name 2228 in 2229 (* Foreign content handling *) 2230 let in_foreign = 2231 match adjusted_current_node t with 2232 | None -> false 2233 | Some node -> 2234 if is_in_html_namespace node then false 2235 else begin 2236 (* At HTML integration points, characters and start tags (except mglyph/malignmark) use HTML rules *) 2237 if is_html_integration_point node then begin 2238 match token with 2239 | Token.Character _ -> false 2240 | Token.Tag { kind = Token.Start; _ } -> false 2241 | _ -> true 2242 end 2243 (* At MathML text integration points, characters and start tags (except mglyph/malignmark) use HTML rules *) 2244 else if is_mathml_text_integration_point node then begin 2245 match token with 2246 | Token.Character _ -> false 2247 | Token.Tag { kind = Token.Start; name; _ } -> 2248 name = "mglyph" || name = "malignmark" 2249 | _ -> true 2250 end 2251 (* Special case: <svg> inside annotation-xml uses HTML rules (creates svg in svg namespace) *) 2252 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then begin 2253 match token with 2254 | Token.Tag { kind = Token.Start; name; _ } when String.lowercase_ascii name = "svg" -> false 2255 | _ -> true 2256 end 2257 (* Not at integration point - use foreign content rules *) 2258 (* Breakout handling is done inside process_foreign_content *) 2259 else true 2260 end 2261 in 2262 2263 (* Check if at HTML integration point for special table mode handling *) 2264 let at_integration_point = 2265 match adjusted_current_node t with 2266 | Some node -> 2267 is_html_integration_point node || is_mathml_text_integration_point node 2268 | None -> false 2269 in 2270 2271 if in_foreign then 2272 process_foreign_content t token 2273 else if at_integration_point then begin 2274 (* At integration points, check if in table mode without table in scope *) 2275 let is_table_mode = List.mem t.mode [In_table; In_table_body; In_row; In_cell; In_caption; In_column_group] in 2276 let has_table = has_element_in_table_scope t "table" in 2277 if is_table_mode && not has_table then begin 2278 match token with 2279 | Token.Tag { kind = Token.Start; _ } -> 2280 (* Temporarily use IN_BODY for start tags in table mode without table *) 2281 let saved_mode = t.mode in 2282 t.mode <- In_body; 2283 process_by_mode t token; 2284 if t.mode = In_body then t.mode <- saved_mode 2285 | _ -> process_by_mode t token 2286 end else 2287 process_by_mode t token 2288 end else 2289 process_by_mode t token 2290 2291(* Pop foreign elements until HTML or integration point *) 2292and pop_until_html_or_integration_point t = 2293 let is_html_integration_point node = 2294 (* SVG foreignObject, desc, and title are always HTML integration points *) 2295 if node.Dom.namespace = Some "svg" && 2296 Parser_constants.is_svg_html_integration node.Dom.name then true 2297 (* annotation-xml is an HTML integration point only with specific encoding values *) 2298 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then 2299 match List.assoc_opt "encoding" node.Dom.attrs with 2300 | Some enc -> 2301 let enc_lower = String.lowercase_ascii enc in 2302 enc_lower = "text/html" || enc_lower = "application/xhtml+xml" 2303 | None -> false 2304 else false 2305 in 2306 (* Get fragment context element - only for foreign namespace fragment contexts *) 2307 let fragment_context_elem = t.fragment_context_element in 2308 let rec pop () = 2309 match current_node t with 2310 | None -> () 2311 | Some node -> 2312 if is_in_html_namespace node then () 2313 else if is_html_integration_point node then () 2314 (* Don't pop past fragment context element *) 2315 else (match fragment_context_elem with 2316 | Some ctx when node == ctx -> () 2317 | _ -> 2318 pop_current t; 2319 pop ()) 2320 in 2321 pop () 2322 2323(* Foreign breakout elements - these break out of foreign content *) 2324and is_foreign_breakout_element name = 2325 List.mem (String.lowercase_ascii name) 2326 ["b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt"; 2327 "em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li"; 2328 "listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span"; 2329 "strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var"] 2330 2331and process_foreign_content t token = 2332 match token with 2333 | Token.Character data when String.contains data '\x00' -> 2334 (* Replace NUL characters with U+FFFD replacement character *) 2335 parse_error t "unexpected-null-character"; 2336 let buf = Buffer.create (String.length data) in 2337 let has_non_ws_non_nul = ref false in 2338 String.iter (fun c -> 2339 if c = '\x00' then Buffer.add_string buf "\xEF\xBF\xBD" 2340 else begin 2341 Buffer.add_char buf c; 2342 if not (c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r') then 2343 has_non_ws_non_nul := true 2344 end 2345 ) data; 2346 let replaced = Buffer.contents buf in 2347 insert_character t replaced; 2348 (* Only set frameset_ok to false if there's actual non-whitespace non-NUL content *) 2349 if !has_non_ws_non_nul then t.frameset_ok <- false 2350 | Token.Character data when is_whitespace data -> 2351 insert_character t data 2352 | Token.Character data -> 2353 insert_character t data; 2354 t.frameset_ok <- false 2355 | Token.Comment data -> 2356 insert_comment t data 2357 | Token.Doctype _ -> 2358 parse_error t "unexpected-doctype" 2359 | Token.Tag { kind = Token.Start; name; _ } when is_foreign_breakout_element name -> 2360 (* Breakout from foreign content - pop until HTML or integration point, reprocess in HTML mode *) 2361 parse_error t "unexpected-html-element-in-foreign-content"; 2362 pop_until_html_or_integration_point t; 2363 reset_insertion_mode t; 2364 (* Use process_by_mode to force HTML mode processing and avoid infinite loop *) 2365 process_by_mode t token 2366 | Token.Tag { kind = Token.Start; name = "font"; attrs; _ } 2367 when List.exists (fun (n, _) -> 2368 let n = String.lowercase_ascii n in 2369 n = "color" || n = "face" || n = "size") attrs -> 2370 (* font with color/face/size breaks out of foreign content *) 2371 parse_error t "unexpected-html-element-in-foreign-content"; 2372 pop_until_html_or_integration_point t; 2373 reset_insertion_mode t; 2374 process_by_mode t token 2375 | Token.Tag { kind = Token.Start; name; attrs; self_closing } -> 2376 let name = 2377 match adjusted_current_node t with 2378 | Some n when n.Dom.namespace = Some "svg" -> Parser_constants.adjust_svg_tag_name name 2379 | _ -> name 2380 in 2381 let attrs = 2382 match adjusted_current_node t with 2383 | Some n when n.Dom.namespace = Some "svg" -> 2384 Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) 2385 | Some n when n.Dom.namespace = Some "mathml" -> 2386 Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) 2387 | _ -> Parser_constants.adjust_foreign_attrs attrs 2388 in 2389 let namespace = 2390 match adjusted_current_node t with 2391 | Some n -> n.Dom.namespace 2392 | None -> None 2393 in 2394 let node = insert_element t name ~namespace attrs in 2395 t.open_elements <- node :: t.open_elements; 2396 if self_closing then pop_current t 2397 | Token.Tag { kind = Token.End; name; _ } when List.mem (String.lowercase_ascii name) ["br"; "p"] -> 2398 (* Special case: </br> and </p> end tags trigger breakout from foreign content *) 2399 parse_error t "unexpected-html-element-in-foreign-content"; 2400 pop_until_html_or_integration_point t; 2401 reset_insertion_mode t; 2402 (* Use process_by_mode to force HTML mode processing and avoid infinite loop *) 2403 process_by_mode t token 2404 | Token.Tag { kind = Token.End; name; _ } -> 2405 (* Find matching element per WHATWG spec for foreign content *) 2406 let is_fragment_context n = 2407 match t.fragment_context_element with 2408 | Some ctx -> n == ctx 2409 | None -> false 2410 in 2411 let name_lower = String.lowercase_ascii name in 2412 (* Walk through stack looking for matching element *) 2413 let rec find_and_process first_node idx = function 2414 | [] -> () (* Stack exhausted - ignore tag *) 2415 | n :: rest -> 2416 let node_name_lower = String.lowercase_ascii n.Dom.name in 2417 let is_html = is_in_html_namespace n in 2418 let name_matches = node_name_lower = name_lower in 2419 2420 (* If first node doesn't match tag name, it's a parse error *) 2421 if first_node && not name_matches then 2422 parse_error t "unexpected-end-tag-in-foreign-content"; 2423 2424 (* Check if this node matches the end tag *) 2425 if name_matches then begin 2426 (* Fragment context check *) 2427 if is_fragment_context n then 2428 parse_error t "unexpected-end-tag-in-fragment-context" 2429 (* If matched element is in HTML namespace, reprocess via HTML mode *) 2430 else if is_html then 2431 process_by_mode t token 2432 (* Otherwise it's a foreign element - pop everything from this point up *) 2433 else begin 2434 (* Pop all elements from current down to and including the matched element *) 2435 let rec pop_to_idx current_idx = 2436 if current_idx >= idx then begin 2437 pop_current t; 2438 pop_to_idx (current_idx - 1) 2439 end 2440 in 2441 pop_to_idx (List.length t.open_elements - 1) 2442 end 2443 end 2444 (* If we hit an HTML element that doesn't match, process via HTML mode *) 2445 else if is_html then 2446 process_by_mode t token 2447 (* Continue searching in the stack *) 2448 else 2449 find_and_process false (idx - 1) rest 2450 in 2451 find_and_process true (List.length t.open_elements - 1) t.open_elements 2452 | Token.EOF -> 2453 process_by_mode t token 2454 2455and process_by_mode t token = 2456 match t.mode with 2457 | Parser_insertion_mode.Initial -> process_initial t token 2458 | Parser_insertion_mode.Before_html -> process_before_html t token 2459 | Parser_insertion_mode.Before_head -> process_before_head t token 2460 | Parser_insertion_mode.In_head -> process_in_head t token 2461 | Parser_insertion_mode.In_head_noscript -> process_in_head_noscript t token 2462 | Parser_insertion_mode.After_head -> process_after_head t token 2463 | Parser_insertion_mode.In_body -> process_in_body t token 2464 | Parser_insertion_mode.Text -> process_text t token 2465 | Parser_insertion_mode.In_table -> process_in_table t token 2466 | Parser_insertion_mode.In_table_text -> process_in_table_text t token 2467 | Parser_insertion_mode.In_caption -> process_in_caption t token 2468 | Parser_insertion_mode.In_column_group -> process_in_column_group t token 2469 | Parser_insertion_mode.In_table_body -> process_in_table_body t token 2470 | Parser_insertion_mode.In_row -> process_in_row t token 2471 | Parser_insertion_mode.In_cell -> process_in_cell t token 2472 | Parser_insertion_mode.In_select -> process_in_select t token 2473 | Parser_insertion_mode.In_select_in_table -> process_in_select_in_table t token 2474 | Parser_insertion_mode.In_template -> process_in_template t token 2475 | Parser_insertion_mode.After_body -> process_after_body t token 2476 | Parser_insertion_mode.In_frameset -> process_in_frameset t token 2477 | Parser_insertion_mode.After_frameset -> process_after_frameset t token 2478 | Parser_insertion_mode.After_after_body -> process_after_after_body t token 2479 | Parser_insertion_mode.After_after_frameset -> process_after_after_frameset t token 2480 2481(* Populate selectedcontent elements with content from selected option *) 2482let find_elements name node = 2483 let result = ref [] in 2484 let rec find n = 2485 if n.Dom.name = name then result := n :: !result; 2486 List.iter find n.Dom.children 2487 in 2488 find node; 2489 List.rev !result (* Reverse to maintain document order *) 2490 2491let find_element name node = 2492 let rec find n = 2493 if n.Dom.name = name then Some n 2494 else 2495 List.find_map find n.Dom.children 2496 in 2497 find node 2498 2499let populate_selectedcontent document = 2500 let selects = find_elements "select" document in 2501 List.iter (fun select -> 2502 match find_element "selectedcontent" select with 2503 | None -> () 2504 | Some selectedcontent -> 2505 let options = find_elements "option" select in 2506 if options <> [] then begin 2507 (* Find selected option or use first *) 2508 let selected_option = 2509 match List.find_opt (fun opt -> Dom.has_attr opt "selected") options with 2510 | Some opt -> opt 2511 | None -> List.hd options 2512 in 2513 (* Clone children from selected option to selectedcontent *) 2514 List.iter (fun child -> 2515 let cloned = Dom.clone ~deep:true child in 2516 Dom.append_child selectedcontent cloned 2517 ) selected_option.Dom.children 2518 end 2519 ) selects 2520 2521let finish t = 2522 (* Populate selectedcontent elements *) 2523 populate_selectedcontent t.document; 2524 (* For fragment parsing, remove the html wrapper and promote children *) 2525 if t.fragment_context <> None then begin 2526 match t.document.Dom.children with 2527 | [root] when root.Dom.name = "html" -> 2528 (* Move context element's children to root if applicable *) 2529 (match t.fragment_context_element with 2530 | Some ctx_elem -> 2531 (match ctx_elem.Dom.parent with 2532 | Some p when p == root -> 2533 let ctx_children = ctx_elem.Dom.children in 2534 List.iter (fun child -> 2535 Dom.remove_child ctx_elem child; 2536 Dom.append_child root child 2537 ) ctx_children; 2538 Dom.remove_child root ctx_elem 2539 | _ -> ()) 2540 | None -> ()); 2541 (* Promote root's children to document - preserve order *) 2542 let children_copy = root.Dom.children in 2543 List.iter (fun child -> 2544 Dom.remove_child root child; 2545 Dom.append_child t.document child 2546 ) children_copy; 2547 Dom.remove_child t.document root 2548 | _ -> () 2549 end; 2550 t.document 2551 2552let get_errors t = List.rev t.errors