OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* HTML5 Tree Builder *)
2
3module Dom = Dom
4module Token = Tokenizer_token
5module State = Tokenizer_state
6
7type fragment_context = {
8 tag_name : string;
9 namespace : string option;
10}
11
12type formatting_entry =
13 | Marker
14 | Entry of {
15 name : string;
16 attrs : (string * string) list;
17 node : Dom.node;
18 }
19
20type parse_error = {
21 code : Parse_error_code.t;
22 line : int;
23 column : int;
24}
25
26type t = {
27 mutable document : Dom.node;
28 mutable mode : Parser_insertion_mode.t;
29 mutable original_mode : Parser_insertion_mode.t option;
30 mutable open_elements : Dom.node list;
31 mutable active_formatting : formatting_entry list;
32 mutable head_element : Dom.node option;
33 mutable form_element : Dom.node option;
34 mutable frameset_ok : bool;
35 mutable ignore_lf : bool;
36 mutable foster_parenting : bool;
37 mutable pending_table_chars : string list;
38 mutable template_modes : Parser_insertion_mode.t list;
39 mutable quirks_mode : Dom.quirks_mode;
40 mutable errors : parse_error list;
41 collect_errors : bool;
42 fragment_context : fragment_context option;
43 mutable fragment_context_element : Dom.node option;
44 iframe_srcdoc : bool;
45 mutable current_line : int;
46 mutable current_column : int;
47}
48
49let create ?(collect_errors=false) ?fragment_context ?(iframe_srcdoc=false) () =
50 let is_fragment = fragment_context <> None in
51 let doc = if is_fragment then Dom.create_document_fragment () else Dom.create_document () in
52 let t = {
53 document = doc;
54 mode = Parser_insertion_mode.Initial;
55 original_mode = None;
56 open_elements = [];
57 active_formatting = [];
58 head_element = None;
59 form_element = None;
60 frameset_ok = true;
61 ignore_lf = false;
62 foster_parenting = false;
63 pending_table_chars = [];
64 template_modes = [];
65 quirks_mode = Dom.No_quirks;
66 errors = [];
67 collect_errors;
68 fragment_context;
69 fragment_context_element = None;
70 iframe_srcdoc;
71 current_line = 1;
72 current_column = 1;
73 } in
74 (* Initialize fragment parsing *)
75 (match fragment_context with
76 | Some ctx ->
77 let name = String.lowercase_ascii ctx.tag_name in
78 let ns = ctx.namespace in
79 (* Create html root *)
80 let root = Dom.create_element "html" () in
81 Dom.append_child doc root;
82 t.open_elements <- [root];
83 (* For foreign content contexts, create context element *)
84 (match ns with
85 | Some namespace when namespace <> "html" ->
86 let context_elem = Dom.create_element ctx.tag_name ~namespace:ns () in
87 Dom.append_child root context_elem;
88 t.open_elements <- [context_elem; root];
89 t.fragment_context_element <- Some context_elem
90 | _ -> ());
91 (* Set initial mode based on context *)
92 t.mode <- (
93 if name = "html" then Parser_insertion_mode.Before_head
94 else if Parser_constants.is_table_section_element name && (ns = None || ns = Some "html") then
95 Parser_insertion_mode.In_table_body
96 else if name = "tr" && (ns = None || ns = Some "html") then
97 Parser_insertion_mode.In_row
98 else if Parser_constants.is_table_cell_element name && (ns = None || ns = Some "html") then
99 Parser_insertion_mode.In_cell
100 else if name = "caption" && (ns = None || ns = Some "html") then
101 Parser_insertion_mode.In_caption
102 else if name = "colgroup" && (ns = None || ns = Some "html") then
103 Parser_insertion_mode.In_column_group
104 else if name = "table" && (ns = None || ns = Some "html") then
105 Parser_insertion_mode.In_table
106 else if name = "template" && (ns = None || ns = Some "html") then begin
107 t.template_modes <- [Parser_insertion_mode.In_template];
108 Parser_insertion_mode.In_template
109 end
110 else
111 Parser_insertion_mode.In_body
112 );
113 t.frameset_ok <- false
114 | None -> ());
115 t
116
117(* Position tracking for error reporting *)
118let set_position t ~line ~column =
119 t.current_line <- line;
120 t.current_column <- column
121
122(* Error handling *)
123let parse_error t code =
124 if t.collect_errors then
125 t.errors <- { code = Parse_error_code.of_string code; line = t.current_line; column = t.current_column } :: t.errors
126
127(* Stack helpers *)
128let current_node t =
129 match t.open_elements with
130 | [] -> None
131 | x :: _ -> Some x
132
133let adjusted_current_node t =
134 match t.fragment_context, t.open_elements with
135 | Some ctx, [_] ->
136 (* Fragment case: use context element info *)
137 Some (Dom.create_element ctx.tag_name ~namespace:ctx.namespace ())
138 | _, x :: _ -> Some x
139 | _, [] -> None
140
141let is_in_html_namespace node =
142 node.Dom.namespace = None || node.Dom.namespace = Some "html"
143
144(* Namespace-aware check for "special" elements per WHATWG spec *)
145let is_special_element node =
146 let name = String.lowercase_ascii node.Dom.name in
147 match node.Dom.namespace with
148 | None | Some "html" -> Parser_constants.is_special name
149 | Some "mathml" -> List.mem name ["mi"; "mo"; "mn"; "ms"; "mtext"; "annotation-xml"]
150 | Some "svg" -> List.mem name ["foreignobject"; "desc"; "title"]
151 | _ -> false
152
153let adjusted_current_node_in_html_namespace t =
154 match adjusted_current_node t with
155 | Some node -> is_in_html_namespace node
156 | None -> true
157
158(* Insertion helpers *)
159let appropriate_insertion_place t =
160 match current_node t with
161 | None -> (t.document, None)
162 | Some target ->
163 if t.foster_parenting && Parser_constants.is_foster_parenting_element target.Dom.name then begin
164 (* Foster parenting per WHATWG spec *)
165 (* Step 1: Find last (most recent) template and table in stack *)
166 (* Note: index 0 = top of stack = most recently added *)
167 let last_template_idx = ref None in
168 let last_table_idx = ref None in
169 List.iteri (fun i n ->
170 (* Take first match (most recent = lowest index) *)
171 if n.Dom.name = "template" && !last_template_idx = None then last_template_idx := Some i;
172 if n.Dom.name = "table" && !last_table_idx = None then last_table_idx := Some i
173 ) t.open_elements;
174
175 (* Step 2-3: If last template is more recent than last table (lower index = more recent) *)
176 match !last_template_idx, !last_table_idx with
177 | Some ti, None ->
178 (* No table, use template content *)
179 let template = List.nth t.open_elements ti in
180 (match template.Dom.template_content with
181 | Some tc -> (tc, None)
182 | None -> (template, None))
183 | Some ti, Some tbi when ti < tbi ->
184 (* Template is more recent than table, use template content *)
185 let template = List.nth t.open_elements ti in
186 (match template.Dom.template_content with
187 | Some tc -> (tc, None)
188 | None -> (template, None))
189 | _, Some tbi ->
190 (* Use table's parent as foster parent *)
191 let table = List.nth t.open_elements tbi in
192 (match table.Dom.parent with
193 | Some parent -> (parent, Some table)
194 | None ->
195 (* Step 6: element above table in stack (index + 1 since 0 is top) *)
196 if tbi + 1 < List.length t.open_elements then
197 (List.nth t.open_elements (tbi + 1), None)
198 else
199 (t.document, None))
200 | None, None ->
201 (* No table or template, use document *)
202 (t.document, None)
203 end else begin
204 (* If target is a template, insert into its content document fragment *)
205 match target.Dom.template_content with
206 | Some tc -> (tc, None)
207 | None -> (target, None)
208 end
209
210let insert_element t name ?(namespace=None) ?(push=false) attrs =
211 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
212 let node = Dom.create_element name ~namespace ~attrs ~location () in
213 let (parent, before) = appropriate_insertion_place t in
214 (match before with
215 | None -> Dom.append_child parent node
216 | Some ref -> Dom.insert_before parent node ref);
217 if push then t.open_elements <- node :: t.open_elements;
218 node
219
220let insert_element_for_token t (tag : Token.tag) =
221 insert_element t tag.name ~push:true tag.attrs
222
223let insert_foreign_element t (tag : Token.tag) namespace =
224 let attrs =
225 if namespace = Some "svg" then
226 Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs tag.attrs)
227 else
228 Parser_constants.adjust_foreign_attrs tag.attrs
229 in
230 let name =
231 if namespace = Some "svg" then Parser_constants.adjust_svg_tag_name tag.name
232 else tag.name
233 in
234 let node = insert_element t name ~namespace attrs in
235 t.open_elements <- node :: t.open_elements;
236 node
237
238let insert_character t data =
239 if t.ignore_lf && String.length data > 0 && data.[0] = '\n' then begin
240 t.ignore_lf <- false;
241 if String.length data > 1 then begin
242 let rest = String.sub data 1 (String.length data - 1) in
243 let (parent, before) = appropriate_insertion_place t in
244 Dom.insert_text_at parent rest before
245 end
246 end else begin
247 t.ignore_lf <- false;
248 let (parent, before) = appropriate_insertion_place t in
249 Dom.insert_text_at parent data before
250 end
251
252let insert_comment t data =
253 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
254 let node = Dom.create_comment ~location data in
255 let (parent, _) = appropriate_insertion_place t in
256 Dom.append_child parent node
257
258let insert_comment_to_document t data =
259 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
260 let node = Dom.create_comment ~location data in
261 Dom.append_child t.document node
262
263(* Stack manipulation *)
264let pop_current t =
265 match t.open_elements with
266 | [] -> ()
267 | _ :: rest -> t.open_elements <- rest
268
269let pop_until t pred =
270 let rec loop () =
271 match t.open_elements with
272 | [] -> ()
273 | x :: rest ->
274 t.open_elements <- rest;
275 if not (pred x) then loop ()
276 in
277 loop ()
278
279let pop_until_tag t name =
280 pop_until t (fun n -> n.Dom.name = name)
281
282(* Pop until HTML namespace element with given name *)
283let pop_until_html_tag t name =
284 pop_until t (fun n -> n.Dom.name = name && is_in_html_namespace n)
285
286let pop_until_one_of t names =
287 pop_until t (fun n -> List.mem n.Dom.name names)
288
289(* Pop until HTML namespace element with one of given names *)
290let pop_until_html_one_of t names =
291 pop_until t (fun n -> List.mem n.Dom.name names && is_in_html_namespace n)
292
293(* Check if element is an HTML integration point *)
294let is_html_integration_point node =
295 (* SVG foreignObject, desc, and title are always HTML integration points *)
296 if node.Dom.namespace = Some "svg" &&
297 Parser_constants.is_svg_html_integration node.Dom.name then true
298 (* annotation-xml is an HTML integration point only with specific encoding values *)
299 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
300 match List.assoc_opt "encoding" node.Dom.attrs with
301 | Some enc ->
302 let enc_lower = String.lowercase_ascii enc in
303 enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
304 | None -> false
305 else false
306
307(* Check if element is a MathML text integration point *)
308let is_mathml_text_integration_point node =
309 node.Dom.namespace = Some "mathml" &&
310 Parser_constants.is_mathml_text_integration node.Dom.name
311
312(* Scope checks - integration points also terminate scope (except for table scope) *)
313(* Per WHATWG spec, scope checks only consider HTML namespace elements for the target names *)
314let has_element_in_scope_impl t names exclude_list ~check_integration_points =
315 let rec check = function
316 | [] -> false
317 | n :: rest ->
318 (* Target elements must be in HTML namespace *)
319 if is_in_html_namespace n && List.mem n.Dom.name names then true
320 else if is_in_html_namespace n && List.mem n.Dom.name exclude_list then false
321 (* Integration points terminate scope (unless we're checking table scope) *)
322 else if check_integration_points && (is_html_integration_point n || is_mathml_text_integration_point n) then false
323 else check rest
324 in
325 check t.open_elements
326
327let has_element_in_scope t name =
328 has_element_in_scope_impl t [name] Parser_constants.default_scope ~check_integration_points:true
329
330let has_element_in_button_scope t name =
331 has_element_in_scope_impl t [name] Parser_constants.button_scope ~check_integration_points:true
332
333let has_element_in_list_item_scope t name =
334 has_element_in_scope_impl t [name] Parser_constants.list_item_scope ~check_integration_points:true
335
336let has_element_in_table_scope t name =
337 has_element_in_scope_impl t [name] Parser_constants.table_scope ~check_integration_points:false
338
339let has_element_in_select_scope t name =
340 let rec check = function
341 | [] -> false
342 | n :: rest ->
343 if n.Dom.name = name then true
344 else if not (Parser_constants.is_select_scope_exclude n.Dom.name) then false
345 else check rest
346 in
347 check t.open_elements
348
349(* Implied end tags *)
350let generate_implied_end_tags t ?except () =
351 let rec loop () =
352 match current_node t with
353 | Some n when Parser_constants.is_implied_end_tag n.Dom.name ->
354 (match except with
355 | Some ex when n.Dom.name = ex -> ()
356 | _ -> pop_current t; loop ())
357 | _ -> ()
358 in
359 loop ()
360
361let generate_all_implied_end_tags t =
362 let rec loop () =
363 match current_node t with
364 | Some n when Parser_constants.is_thoroughly_implied_end_tag n.Dom.name ->
365 pop_current t; loop ()
366 | _ -> ()
367 in
368 loop ()
369
370(* Active formatting elements *)
371let push_formatting_marker t =
372 t.active_formatting <- Marker :: t.active_formatting
373
374let push_formatting_element t node name attrs =
375 (* Noah's Ark: remove earlier identical elements (up to 3) *)
376 let rec count_and_remove same acc = function
377 | [] -> List.rev acc
378 | Marker :: rest -> List.rev acc @ (Marker :: rest)
379 | Entry e :: rest when e.name = name && e.attrs = attrs ->
380 if same >= 2 then
381 count_and_remove same acc rest (* Remove this one *)
382 else
383 count_and_remove (same + 1) (Entry e :: acc) rest
384 | x :: rest -> count_and_remove same (x :: acc) rest
385 in
386 t.active_formatting <- count_and_remove 0 [] t.active_formatting;
387 t.active_formatting <- Entry { name; attrs; node } :: t.active_formatting
388
389let clear_active_formatting_to_marker t =
390 let rec loop = function
391 | [] -> []
392 | Marker :: rest -> rest
393 | _ :: rest -> loop rest
394 in
395 t.active_formatting <- loop t.active_formatting
396
397let reconstruct_active_formatting t =
398 let rec find_to_reconstruct acc = function
399 | [] -> acc
400 | Marker :: _ -> acc
401 | Entry e :: rest ->
402 if List.exists (fun n -> n == e.node) t.open_elements then acc
403 else find_to_reconstruct (Entry e :: acc) rest
404 in
405 let to_reconstruct = find_to_reconstruct [] t.active_formatting in
406 List.iter (fun entry ->
407 match entry with
408 | Entry e ->
409 let node = insert_element t e.name e.attrs in
410 t.open_elements <- node :: t.open_elements;
411 (* Update the entry to point to new node *)
412 t.active_formatting <- List.map (fun x ->
413 if x == entry then Entry { e with node }
414 else x
415 ) t.active_formatting
416 | Marker -> ()
417 ) to_reconstruct
418
419(* Adoption agency algorithm - follows WHATWG spec *)
420let adoption_agency t tag_name =
421 (* Step 1: If current node is subject and not in active formatting list, just pop *)
422 (match current_node t with
423 | Some n when n.Dom.name = tag_name ->
424 let in_active = List.exists (function
425 | Entry e -> e.name = tag_name
426 | Marker -> false
427 ) t.active_formatting in
428 if not in_active then begin
429 pop_current t;
430 () (* Return early - this case is handled *)
431 end
432 | _ -> ());
433
434 (* Step 2: Outer loop *)
435 let outer_loop_counter = ref 0 in
436 let done_flag = ref false in
437
438 while !outer_loop_counter < 8 && not !done_flag do
439 incr outer_loop_counter;
440
441 (* Step 3: Find formatting element in active formatting list *)
442 let rec find_formatting_index idx = function
443 | [] -> None
444 | Marker :: _ -> None
445 | Entry e :: rest ->
446 if e.name = tag_name then Some (idx, e.node, e.attrs)
447 else find_formatting_index (idx + 1) rest
448 in
449 let formatting_entry = find_formatting_index 0 t.active_formatting in
450
451 match formatting_entry with
452 | None ->
453 (* No formatting element found - done *)
454 done_flag := true
455 | Some (fmt_idx, fmt_node, fmt_attrs) ->
456
457 (* Step 4: Check if formatting element is in open elements *)
458 if not (List.exists (fun n -> n == fmt_node) t.open_elements) then begin
459 parse_error t "adoption-agency-1.2";
460 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
461 done_flag := true
462 end
463 (* Step 5: Check if formatting element is in scope *)
464 else if not (has_element_in_scope t tag_name) then begin
465 parse_error t "adoption-agency-1.3";
466 done_flag := true
467 end else begin
468 (* Step 6: Parse error if not current node *)
469 (match current_node t with
470 | Some n when n != fmt_node -> parse_error t "adoption-agency-1.3"
471 | _ -> ());
472
473 (* Step 7: Find furthest block - first special element BELOW formatting element *)
474 (* open_elements is [current(top)...html(bottom)], formatting element is somewhere in the middle *)
475 (* We need the first special element going from formatting element toward current *)
476 (* This is the "topmost" (closest to formatting element) special element that is "lower" (closer to current) *)
477 let fmt_stack_idx = ref (-1) in
478 List.iteri (fun i n -> if n == fmt_node then fmt_stack_idx := i) t.open_elements;
479 let furthest_block =
480 if !fmt_stack_idx <= 0 then None
481 else begin
482 (* Look from fmt_stack_idx-1 down to 0, find first special element *)
483 let rec find_from_idx idx =
484 if idx < 0 then None
485 else
486 let n = List.nth t.open_elements idx in
487 if is_special_element n then Some n
488 else find_from_idx (idx - 1)
489 in
490 find_from_idx (!fmt_stack_idx - 1)
491 end
492 in
493
494 match furthest_block with
495 | None ->
496 (* Step 8: No furthest block - pop elements including formatting element *)
497 pop_until t (fun n -> n == fmt_node);
498 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
499 done_flag := true
500
501 | Some fb ->
502 (* Step 9: Let common ancestor be element immediately above formatting element *)
503 let rec find_common_ancestor = function
504 | [] -> None
505 | n :: rest when n == fmt_node ->
506 (match rest with x :: _ -> Some x | [] -> None)
507 | _ :: rest -> find_common_ancestor rest
508 in
509 let common_ancestor = find_common_ancestor t.open_elements in
510
511 (* Step 10: Bookmark starts after formatting element *)
512 let bookmark = ref (fmt_idx + 1) in
513
514 (* Step 11: Let last_node = furthest block *)
515 let last_node = ref fb in
516
517 (* Step 12: Inner loop *)
518 (* The inner loop processes elements between furthest_block and formatting_element,
519 removing non-formatting elements and reparenting formatting elements *)
520 let inner_loop_counter = ref 0 in
521
522 (* Get index of furthest block in open elements *)
523 let fb_idx = ref 0 in
524 List.iteri (fun i n -> if n == fb then fb_idx := i) t.open_elements;
525
526 (* Start from element after furthest block (toward formatting element) *)
527 let node_idx = ref (!fb_idx + 1) in
528
529 while !node_idx < List.length t.open_elements &&
530 (List.nth t.open_elements !node_idx) != fmt_node do
531 incr inner_loop_counter;
532 let current_node = List.nth t.open_elements !node_idx in
533
534 (* Step 12.3: Find node in active formatting list *)
535 let rec find_node_in_formatting idx = function
536 | [] -> None
537 | Entry e :: _rest when e.node == current_node -> Some idx
538 | _ :: rest -> find_node_in_formatting (idx + 1) rest
539 in
540 let node_fmt_idx = find_node_in_formatting 0 t.active_formatting in
541
542 (* Step 12.4: If inner loop counter > 3 and node in active formatting, remove it *)
543 let node_fmt_idx =
544 match node_fmt_idx with
545 | Some idx when !inner_loop_counter > 3 ->
546 t.active_formatting <- List.filteri (fun i _ -> i <> idx) t.active_formatting;
547 if idx < !bookmark then decr bookmark;
548 None
549 | x -> x
550 in
551
552 (* Step 12.5: If node not in active formatting, remove from stack and continue *)
553 match node_fmt_idx with
554 | None ->
555 (* Remove from stack - this shifts indices *)
556 t.open_elements <- List.filteri (fun i _ -> i <> !node_idx) t.open_elements
557 (* Don't increment node_idx since we removed an element *)
558
559 | Some af_idx ->
560 (* Step 12.6: Create new element for node *)
561 let (node_name, node_attrs) = match List.nth t.active_formatting af_idx with
562 | Entry e -> (e.name, e.attrs)
563 | Marker -> failwith "unexpected marker"
564 in
565 let new_node_elem = Dom.create_element node_name ~attrs:node_attrs () in
566
567 (* Update active formatting with new node *)
568 t.active_formatting <- List.mapi (fun i entry ->
569 if i = af_idx then Entry { name = node_name; node = new_node_elem; attrs = node_attrs }
570 else entry
571 ) t.active_formatting;
572
573 (* Replace node in open elements *)
574 t.open_elements <- List.mapi (fun i n ->
575 if i = !node_idx then new_node_elem else n
576 ) t.open_elements;
577
578 (* Step 12.7: If last_node is furthest block, update bookmark *)
579 if !last_node == fb then
580 bookmark := af_idx + 1;
581
582 (* Step 12.8: Reparent last_node to new node *)
583 (match !last_node.Dom.parent with
584 | Some p -> Dom.remove_child p !last_node
585 | None -> ());
586 Dom.append_child new_node_elem !last_node;
587
588 (* Step 12.9: Let last_node = new node *)
589 last_node := new_node_elem;
590
591 (* Move to next element *)
592 incr node_idx
593 done;
594
595 (* Step 13: Insert last_node into common ancestor *)
596 (match common_ancestor with
597 | Some ca ->
598 (match !last_node.Dom.parent with
599 | Some p -> Dom.remove_child p !last_node
600 | None -> ());
601 (* Check if we need foster parenting *)
602 if t.foster_parenting && Parser_constants.is_foster_parenting_element ca.Dom.name then begin
603 (* Find table and insert before it *)
604 let rec find_table = function
605 | [] -> None
606 | n :: rest when n.Dom.name = "table" -> Some (n, rest)
607 | _ :: rest -> find_table rest
608 in
609 match find_table t.open_elements with
610 | Some (table, _) ->
611 (match table.Dom.parent with
612 | Some parent -> Dom.insert_before parent !last_node table
613 | None -> Dom.append_child ca !last_node)
614 | None -> Dom.append_child ca !last_node
615 end else begin
616 (* If common ancestor is template, insert into its content *)
617 match ca.Dom.template_content with
618 | Some tc -> Dom.append_child tc !last_node
619 | None -> Dom.append_child ca !last_node
620 end
621 | None -> ());
622
623 (* Step 14: Create new formatting element *)
624 let new_formatting = Dom.create_element tag_name ~attrs:fmt_attrs () in
625
626 (* Step 15: Move children of furthest block to new formatting element *)
627 let fb_children = fb.Dom.children in
628 List.iter (fun child ->
629 Dom.remove_child fb child;
630 Dom.append_child new_formatting child
631 ) fb_children;
632
633 (* Step 16: Append new formatting element to furthest block *)
634 Dom.append_child fb new_formatting;
635
636 (* Step 17: Remove old from active formatting, insert new at bookmark *)
637 let new_entry = Entry { name = tag_name; node = new_formatting; attrs = fmt_attrs } in
638 t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
639 (* Adjust bookmark since we removed an element *)
640 let adjusted_bookmark = if fmt_idx < !bookmark then !bookmark - 1 else !bookmark in
641 let rec insert_at_bookmark idx acc = function
642 | [] -> List.rev (new_entry :: acc)
643 | x :: rest when idx = adjusted_bookmark ->
644 List.rev_append acc (new_entry :: x :: rest)
645 | x :: rest -> insert_at_bookmark (idx + 1) (x :: acc) rest
646 in
647 t.active_formatting <- insert_at_bookmark 0 [] t.active_formatting;
648
649 (* Step 18: Remove formatting element from open elements, insert new after furthest block *)
650 (* "After" in stack terms means new_formatting should be between fb and current node *)
651 (* In our list orientation (current at index 0), this means new_formatting at lower index than fb *)
652 t.open_elements <- List.filter (fun n -> n != fmt_node) t.open_elements;
653 (* Find fb and insert new_formatting before it (lower index = closer to current) *)
654 let rec insert_before acc = function
655 | [] -> List.rev (new_formatting :: acc)
656 | n :: rest when n == fb ->
657 (* Insert new_formatting before fb: acc reversed, then new_formatting, then fb, then rest *)
658 List.rev_append acc (new_formatting :: n :: rest)
659 | n :: rest -> insert_before (n :: acc) rest
660 in
661 t.open_elements <- insert_before [] t.open_elements
662 (* Continue outer loop *)
663 end
664 done
665
666(* Close p element *)
667let close_p_element t =
668 generate_implied_end_tags t ~except:"p" ();
669 (match current_node t with
670 | Some n when n.Dom.name <> "p" -> parse_error t "end-tag-p-implied-but-open-elements"
671 | _ -> ());
672 pop_until_tag t "p"
673
674(* Reset insertion mode *)
675let reset_insertion_mode t =
676 let rec check_node last = function
677 | [] -> t.mode <- Parser_insertion_mode.In_body
678 | node :: rest ->
679 let is_last = rest = [] in
680 let node_to_check =
681 if is_last then
682 match t.fragment_context with
683 | Some ctx -> Dom.create_element ctx.tag_name ~namespace:ctx.namespace ()
684 | None -> node
685 else node
686 in
687 let name = node_to_check.Dom.name in
688 if name = "select" then begin
689 if not is_last then begin
690 let rec find_table_or_template = function
691 | [] -> ()
692 | n :: rest ->
693 if n.Dom.name = "template" then t.mode <- Parser_insertion_mode.In_select
694 else if n.Dom.name = "table" then t.mode <- Parser_insertion_mode.In_select_in_table
695 else find_table_or_template rest
696 in
697 find_table_or_template rest
698 end;
699 if t.mode <> Parser_insertion_mode.In_select_in_table then
700 t.mode <- Parser_insertion_mode.In_select
701 end else if Parser_constants.is_table_cell_element name && not is_last then
702 t.mode <- Parser_insertion_mode.In_cell
703 else if name = "tr" then
704 t.mode <- Parser_insertion_mode.In_row
705 else if Parser_constants.is_table_section_element name then
706 t.mode <- Parser_insertion_mode.In_table_body
707 else if name = "caption" then
708 t.mode <- Parser_insertion_mode.In_caption
709 else if name = "colgroup" then
710 t.mode <- Parser_insertion_mode.In_column_group
711 else if name = "table" then
712 t.mode <- Parser_insertion_mode.In_table
713 else if name = "template" then
714 t.mode <- (match t.template_modes with m :: _ -> m | [] -> Parser_insertion_mode.In_template)
715 else if name = "head" && not is_last then
716 t.mode <- Parser_insertion_mode.In_head
717 else if name = "body" then
718 t.mode <- Parser_insertion_mode.In_body
719 else if name = "frameset" then
720 t.mode <- Parser_insertion_mode.In_frameset
721 else if name = "html" then
722 t.mode <- (if t.head_element = None then Parser_insertion_mode.Before_head else Parser_insertion_mode.After_head)
723 else if is_last then
724 t.mode <- Parser_insertion_mode.In_body
725 else
726 check_node last rest
727 in
728 check_node false t.open_elements
729
730let is_whitespace s =
731 let ws = [' '; '\t'; '\n'; '\x0C'; '\r'] in
732 String.for_all (fun c -> List.mem c ws) s
733
734(* Mode handlers *)
735let rec process_initial t token =
736 match token with
737 | Token.Character data when is_whitespace data -> ()
738 | Token.Comment data -> insert_comment_to_document t data
739 | Token.Doctype dt ->
740 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
741 let node = Dom.create_doctype ?name:dt.name ?public_id:dt.public_id ?system_id:dt.system_id ~location () in
742 Dom.append_child t.document node;
743 (* Quirks mode detection *)
744 if dt.force_quirks then
745 t.quirks_mode <- Dom.Quirks
746 else if dt.name <> Some "html" then
747 t.quirks_mode <- Dom.Quirks
748 else begin
749 let pub = Option.map String.lowercase_ascii dt.public_id in
750 let sys = Option.map String.lowercase_ascii dt.system_id in
751 let is_quirky =
752 (match pub with
753 | Some p -> List.mem p Parser_constants.quirky_public_matches ||
754 List.exists (fun prefix -> String.length p >= String.length prefix &&
755 String.sub p 0 (String.length prefix) = prefix) Parser_constants.quirky_public_prefixes
756 | None -> false) ||
757 (match sys with
758 | Some s -> List.mem s Parser_constants.quirky_system_matches
759 | None -> false)
760 in
761 if is_quirky then t.quirks_mode <- Dom.Quirks
762 else begin
763 let is_limited_quirky =
764 match pub with
765 | Some p -> List.exists (fun prefix -> String.length p >= String.length prefix &&
766 String.sub p 0 (String.length prefix) = prefix)
767 Parser_constants.limited_quirky_public_prefixes
768 | None -> false
769 in
770 if is_limited_quirky then t.quirks_mode <- Dom.Limited_quirks
771 end
772 end;
773 t.mode <- Parser_insertion_mode.Before_html
774 | _ ->
775 parse_error t "expected-doctype-but-got-other";
776 t.quirks_mode <- Dom.Quirks;
777 t.mode <- Parser_insertion_mode.Before_html;
778 process_token t token
779
780and process_before_html t token =
781 match token with
782 | Token.Doctype _ -> parse_error t "unexpected-doctype"
783 | Token.Comment data -> insert_comment_to_document t data
784 | Token.Character data when is_whitespace data -> ()
785 | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
786 let html = insert_element t "html" attrs in
787 t.open_elements <- [html];
788 t.mode <- Parser_insertion_mode.Before_head
789 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
790 let html = insert_element t "html" [] in
791 t.open_elements <- [html];
792 t.mode <- Parser_insertion_mode.Before_head;
793 process_token t token
794 | Token.Tag { kind = Token.End; name; _ } ->
795 parse_error t ("unexpected-end-tag:" ^ name)
796 | _ ->
797 let html = insert_element t "html" [] in
798 t.open_elements <- [html];
799 t.mode <- Parser_insertion_mode.Before_head;
800 process_token t token
801
802and process_before_head t token =
803 match token with
804 | Token.Character data when is_whitespace data -> ()
805 | Token.Comment data -> insert_comment t data
806 | Token.Doctype _ -> parse_error t "unexpected-doctype"
807 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
808 process_in_body t token
809 | Token.Tag { kind = Token.Start; name = "head"; attrs; _ } ->
810 let head = insert_element t "head" attrs in
811 t.open_elements <- head :: t.open_elements;
812 t.head_element <- Some head;
813 t.mode <- Parser_insertion_mode.In_head
814 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
815 let head = insert_element t "head" [] in
816 t.open_elements <- head :: t.open_elements;
817 t.head_element <- Some head;
818 t.mode <- Parser_insertion_mode.In_head;
819 process_token t token
820 | Token.Tag { kind = Token.End; name; _ } ->
821 parse_error t ("unexpected-end-tag:" ^ name)
822 | _ ->
823 let head = insert_element t "head" [] in
824 t.open_elements <- head :: t.open_elements;
825 t.head_element <- Some head;
826 t.mode <- Parser_insertion_mode.In_head;
827 process_token t token
828
829and process_in_head t token =
830 match token with
831 | Token.Character data when is_whitespace data ->
832 insert_character t data
833 | Token.Character data ->
834 (* Extract leading whitespace *)
835 let rec count_leading_ws i =
836 if i >= String.length data then i
837 else match data.[i] with
838 | '\t' | '\n' | '\x0C' | '\r' | ' ' -> count_leading_ws (i + 1)
839 | _ -> i
840 in
841 let ws_count = count_leading_ws 0 in
842 let leading_ws = String.sub data 0 ws_count in
843 let remaining = String.sub data ws_count (String.length data - ws_count) in
844 (* If there's leading whitespace and current element has children, insert it *)
845 if leading_ws <> "" then
846 (match current_node t with
847 | Some n when n.Dom.children <> [] -> insert_character t leading_ws
848 | _ -> ());
849 pop_current t;
850 t.mode <- Parser_insertion_mode.After_head;
851 process_token t (Token.Character remaining)
852 | Token.Comment data ->
853 insert_comment t data
854 | Token.Doctype _ ->
855 parse_error t "unexpected-doctype"
856 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
857 process_in_body t token
858 | Token.Tag { kind = Token.Start; name; attrs; _ }
859 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"] ->
860 ignore (insert_element t name attrs)
861 | Token.Tag { kind = Token.Start; name = "title"; attrs; self_closing } ->
862 ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs; self_closing });
863 t.original_mode <- Some t.mode;
864 t.mode <- Parser_insertion_mode.Text
865 | Token.Tag { kind = Token.Start; name; attrs; self_closing }
866 when List.mem name ["noframes"; "style"] ->
867 ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing });
868 t.original_mode <- Some t.mode;
869 t.mode <- Parser_insertion_mode.Text
870 | Token.Tag { kind = Token.Start; name = "noscript"; attrs; self_closing } ->
871 (* Scripting is disabled: parse noscript content as HTML *)
872 ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs; self_closing });
873 t.mode <- Parser_insertion_mode.In_head_noscript
874 | Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } ->
875 ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing });
876 t.original_mode <- Some t.mode;
877 t.mode <- Parser_insertion_mode.Text
878 | Token.Tag { kind = Token.End; name = "head"; _ } ->
879 pop_current t;
880 t.mode <- Parser_insertion_mode.After_head
881 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
882 pop_current t;
883 t.mode <- Parser_insertion_mode.After_head;
884 process_token t token
885 | Token.Tag { kind = Token.Start; name = "template"; attrs; _ } ->
886 let node = Dom.create_template ~attrs () in
887 let (parent, _) = appropriate_insertion_place t in
888 Dom.append_child parent node;
889 t.open_elements <- node :: t.open_elements;
890 push_formatting_marker t;
891 t.frameset_ok <- false;
892 t.mode <- Parser_insertion_mode.In_template;
893 t.template_modes <- Parser_insertion_mode.In_template :: t.template_modes
894 | Token.Tag { kind = Token.End; name = "template"; _ } ->
895 if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then
896 parse_error t "unexpected-end-tag"
897 else begin
898 generate_all_implied_end_tags t;
899 (match current_node t with
900 | Some n when not (n.Dom.name = "template" && is_in_html_namespace n) -> parse_error t "unexpected-end-tag"
901 | _ -> ());
902 pop_until_html_tag t "template";
903 clear_active_formatting_to_marker t;
904 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
905 reset_insertion_mode t
906 end
907 | Token.Tag { kind = Token.Start; name = "head"; _ } ->
908 parse_error t "unexpected-start-tag"
909 | Token.Tag { kind = Token.End; name; _ } ->
910 parse_error t ("unexpected-end-tag:" ^ name)
911 | _ ->
912 pop_current t;
913 t.mode <- Parser_insertion_mode.After_head;
914 process_token t token
915
916and process_in_head_noscript t token =
917 match token with
918 | Token.Character data when is_whitespace data ->
919 process_in_head t token
920 | Token.Character _ ->
921 parse_error t "unexpected-char-in-noscript";
922 pop_current t; (* Pop noscript *)
923 t.mode <- Parser_insertion_mode.In_head;
924 process_token t token
925 | Token.Comment _ ->
926 process_in_head t token
927 | Token.Doctype _ ->
928 parse_error t "unexpected-doctype"
929 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
930 process_in_body t token
931 | Token.Tag { kind = Token.Start; name; _ }
932 when List.mem name ["basefont"; "bgsound"; "link"; "meta"; "noframes"; "style"] ->
933 process_in_head t token
934 | Token.Tag { kind = Token.Start; name; _ }
935 when List.mem name ["head"; "noscript"] ->
936 parse_error t "unexpected-start-tag"
937 | Token.Tag { kind = Token.Start; name; _ } ->
938 parse_error t ("bad-start-tag-in-head-noscript:" ^ name);
939 pop_current t; (* Pop noscript *)
940 t.mode <- Parser_insertion_mode.In_head;
941 process_token t token
942 | Token.Tag { kind = Token.End; name = "noscript"; _ } ->
943 pop_current t; (* Pop noscript *)
944 t.mode <- Parser_insertion_mode.In_head
945 | Token.Tag { kind = Token.End; name = "br"; _ } ->
946 parse_error t "unexpected-end-tag";
947 pop_current t; (* Pop noscript *)
948 t.mode <- Parser_insertion_mode.In_head;
949 process_token t token
950 | Token.Tag { kind = Token.End; name; _ } ->
951 parse_error t ("unexpected-end-tag:" ^ name)
952 | Token.EOF ->
953 parse_error t "expected-closing-tag-but-got-eof";
954 pop_current t; (* Pop noscript *)
955 t.mode <- Parser_insertion_mode.In_head;
956 process_token t token
957
958and process_after_head t token =
959 match token with
960 | Token.Character data when is_whitespace data ->
961 insert_character t data
962 | Token.Comment data ->
963 insert_comment t data
964 | Token.Doctype _ ->
965 parse_error t "unexpected-doctype"
966 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
967 process_in_body t token
968 | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
969 ignore (insert_element t "body" ~push:true attrs);
970 t.frameset_ok <- false;
971 t.mode <- Parser_insertion_mode.In_body
972 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
973 ignore (insert_element t "frameset" ~push:true attrs);
974 t.mode <- Parser_insertion_mode.In_frameset
975 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
976 (* Special handling for input type="hidden" - parse error, ignore *)
977 let input_type = List.assoc_opt "type" attrs in
978 (match input_type with
979 | Some typ when String.lowercase_ascii typ = "hidden" ->
980 parse_error t "unexpected-hidden-input-after-head"
981 | _ ->
982 (* Non-hidden input creates body *)
983 let body = insert_element t "body" [] in
984 t.open_elements <- body :: t.open_elements;
985 t.mode <- Parser_insertion_mode.In_body;
986 process_token t token)
987 | Token.Tag { kind = Token.Start; name; _ }
988 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
989 parse_error t "unexpected-start-tag";
990 (match t.head_element with
991 | Some head ->
992 t.open_elements <- head :: t.open_elements;
993 process_in_head t token;
994 t.open_elements <- List.filter (fun n -> n != head) t.open_elements
995 | None -> ())
996 | Token.Tag { kind = Token.End; name = "template"; _ } ->
997 process_in_head t token
998 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
999 let body = insert_element t "body" [] in
1000 t.open_elements <- body :: t.open_elements;
1001 t.mode <- Parser_insertion_mode.In_body;
1002 process_token t token
1003 | Token.Tag { kind = Token.Start; name = "head"; _ } ->
1004 parse_error t "unexpected-start-tag"
1005 | Token.Tag { kind = Token.End; name; _ } ->
1006 parse_error t ("unexpected-end-tag:" ^ name)
1007 | _ ->
1008 let body = insert_element t "body" [] in
1009 t.open_elements <- body :: t.open_elements;
1010 t.mode <- Parser_insertion_mode.In_body;
1011 process_token t token
1012
1013and process_in_body t token =
1014 match token with
1015 | Token.Character "\x00" ->
1016 parse_error t "unexpected-null-character"
1017 | Token.Character data ->
1018 reconstruct_active_formatting t;
1019 insert_character t data;
1020 if not (is_whitespace data) then t.frameset_ok <- false
1021 | Token.Comment data ->
1022 insert_comment t data
1023 | Token.Doctype _ ->
1024 parse_error t "unexpected-doctype"
1025 | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
1026 parse_error t "unexpected-start-tag";
1027 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
1028 (* Find the html element (at the bottom of the stack) *)
1029 let html_elem = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in
1030 (match html_elem with
1031 | Some html ->
1032 List.iter (fun (k, v) ->
1033 if not (Dom.has_attr html k) then Dom.set_attr html k v
1034 ) attrs
1035 | None -> ())
1036 | Token.Tag { kind = Token.Start; name; _ }
1037 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
1038 process_in_head t token
1039 | Token.Tag { kind = Token.End; name = "template"; _ } ->
1040 process_in_head t token
1041 | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
1042 parse_error t "unexpected-start-tag";
1043 (* Find body element on stack - it should be near the end (html is last) *)
1044 let body = List.find_opt (fun n -> n.Dom.name = "body") t.open_elements in
1045 (match body with
1046 | Some body when not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) ->
1047 t.frameset_ok <- false;
1048 List.iter (fun (k, v) ->
1049 if not (Dom.has_attr body k) then Dom.set_attr body k v
1050 ) attrs
1051 | _ -> ())
1052 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
1053 if not t.frameset_ok then
1054 parse_error t "unexpected-start-tag-ignored"
1055 else begin
1056 (* Find body element on the stack *)
1057 let rec find_body_index idx = function
1058 | [] -> None
1059 | n :: rest ->
1060 if n.Dom.name = "body" then Some (idx, n)
1061 else find_body_index (idx + 1) rest
1062 in
1063 match find_body_index 0 t.open_elements with
1064 | None ->
1065 parse_error t "unexpected-start-tag-ignored"
1066 | Some (idx, body_elem) ->
1067 (* Remove body from its parent (the html element) *)
1068 (match body_elem.Dom.parent with
1069 | Some parent -> Dom.remove_child parent body_elem
1070 | None -> ());
1071 (* Pop all elements up to and including body - keep only elements after body_idx *)
1072 let rec drop n lst = if n <= 0 then lst else match lst with [] -> [] | _ :: rest -> drop (n - 1) rest in
1073 t.open_elements <- drop (idx + 1) t.open_elements;
1074 (* Insert frameset element *)
1075 ignore (insert_element t "frameset" ~push:true attrs);
1076 t.mode <- Parser_insertion_mode.In_frameset
1077 end
1078 | Token.EOF ->
1079 if t.template_modes <> [] then
1080 process_in_template t token
1081 else begin
1082 let has_unclosed = List.exists (fun n ->
1083 not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"])
1084 ) t.open_elements in
1085 if has_unclosed then parse_error t "expected-closing-tag-but-got-eof"
1086 end
1087 | Token.Tag { kind = Token.End; name = "body"; _ } ->
1088 if not (has_element_in_scope t "body") then
1089 parse_error t "unexpected-end-tag"
1090 else begin
1091 let has_unclosed = List.exists (fun n ->
1092 not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"])
1093 ) t.open_elements in
1094 if has_unclosed then parse_error t "end-tag-too-early";
1095 t.mode <- Parser_insertion_mode.After_body
1096 end
1097 | Token.Tag { kind = Token.End; name = "html"; _ } ->
1098 if not (has_element_in_scope t "body") then
1099 parse_error t "unexpected-end-tag"
1100 else begin
1101 t.mode <- Parser_insertion_mode.After_body;
1102 process_token t token
1103 end
1104 | Token.Tag { kind = Token.Start; name; attrs; _ }
1105 when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] ->
1106 if has_element_in_button_scope t "p" then close_p_element t;
1107 ignore (insert_element t name ~push:true attrs)
1108 | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_heading_element name ->
1109 if has_element_in_button_scope t "p" then close_p_element t;
1110 (match current_node t with
1111 | Some n when Parser_constants.is_heading_element n.Dom.name ->
1112 parse_error t "unexpected-start-tag";
1113 pop_current t
1114 | _ -> ());
1115 ignore (insert_element t name ~push:true attrs)
1116 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["pre"; "listing"] ->
1117 if has_element_in_button_scope t "p" then close_p_element t;
1118 ignore (insert_element t name ~push:true attrs);
1119 t.ignore_lf <- true;
1120 t.frameset_ok <- false
1121 | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } ->
1122 if t.form_element <> None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
1123 parse_error t "unexpected-start-tag"
1124 else begin
1125 if has_element_in_button_scope t "p" then close_p_element t;
1126 let form = insert_element t "form" attrs in
1127 t.open_elements <- form :: t.open_elements;
1128 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
1129 t.form_element <- Some form
1130 end
1131 | Token.Tag { kind = Token.Start; name = "li"; attrs; _ } ->
1132 t.frameset_ok <- false;
1133 let rec check = function
1134 | [] -> ()
1135 | n :: rest ->
1136 if n.Dom.name = "li" then begin
1137 generate_implied_end_tags t ~except:"li" ();
1138 (match current_node t with
1139 | Some cn when cn.Dom.name <> "li" -> parse_error t "unexpected-start-tag"
1140 | _ -> ());
1141 pop_until_tag t "li"
1142 end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then
1143 ()
1144 else
1145 check rest
1146 in
1147 check t.open_elements;
1148 if has_element_in_button_scope t "p" then close_p_element t;
1149 ignore (insert_element t "li" ~push:true attrs)
1150 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["dd"; "dt"] ->
1151 t.frameset_ok <- false;
1152 let rec check = function
1153 | [] -> ()
1154 | n :: rest ->
1155 if List.mem n.Dom.name ["dd"; "dt"] then begin
1156 generate_implied_end_tags t ~except:n.Dom.name ();
1157 (match current_node t with
1158 | Some cn when cn.Dom.name <> n.Dom.name -> parse_error t "unexpected-start-tag"
1159 | _ -> ());
1160 pop_until_one_of t ["dd"; "dt"]
1161 end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then
1162 ()
1163 else
1164 check rest
1165 in
1166 check t.open_elements;
1167 if has_element_in_button_scope t "p" then close_p_element t;
1168 ignore (insert_element t name ~push:true attrs)
1169 | Token.Tag { kind = Token.Start; name = "plaintext"; _ } ->
1170 if has_element_in_button_scope t "p" then close_p_element t;
1171 ignore (insert_element t "plaintext" ~push:true [])
1172 (* Tokenizer should switch to PLAINTEXT state *)
1173 | Token.Tag { kind = Token.Start; name = "button"; attrs; _ } ->
1174 if has_element_in_scope t "button" then begin
1175 parse_error t "unexpected-start-tag";
1176 generate_implied_end_tags t ();
1177 pop_until_tag t "button"
1178 end;
1179 reconstruct_active_formatting t;
1180 ignore (insert_element t "button" ~push:true attrs);
1181 t.frameset_ok <- false
1182 | Token.Tag { kind = Token.End; name; _ }
1183 when List.mem name ["address"; "article"; "aside"; "blockquote"; "button"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "listing"; "main"; "menu"; "nav"; "ol"; "pre"; "search"; "section"; "summary"; "ul"] ->
1184 if not (has_element_in_scope t name) then
1185 parse_error t ("unexpected-end-tag:" ^ name)
1186 else begin
1187 generate_implied_end_tags t ();
1188 (match current_node t with
1189 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1190 | _ -> ());
1191 pop_until_tag t name
1192 end
1193 | Token.Tag { kind = Token.End; name = "form"; _ } ->
1194 if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin
1195 let node = t.form_element in
1196 t.form_element <- None;
1197 match node with
1198 | None -> parse_error t "unexpected-end-tag"
1199 | Some form_node ->
1200 if not (has_element_in_scope t "form") then
1201 parse_error t "unexpected-end-tag"
1202 else begin
1203 generate_implied_end_tags t ();
1204 (match current_node t with
1205 | Some n when n != form_node -> parse_error t "end-tag-too-early"
1206 | _ -> ());
1207 t.open_elements <- List.filter (fun n -> n != form_node) t.open_elements
1208 end
1209 end else begin
1210 if not (has_element_in_scope t "form") then
1211 parse_error t "unexpected-end-tag"
1212 else begin
1213 generate_implied_end_tags t ();
1214 (match current_node t with
1215 | Some n when n.Dom.name <> "form" -> parse_error t "end-tag-too-early"
1216 | _ -> ());
1217 pop_until_tag t "form"
1218 end
1219 end
1220 | Token.Tag { kind = Token.End; name = "p"; _ } ->
1221 if not (has_element_in_button_scope t "p") then begin
1222 parse_error t "no-p-element-in-scope";
1223 ignore (insert_element t "p" ~push:true [])
1224 end;
1225 close_p_element t
1226 | Token.Tag { kind = Token.End; name = "li"; _ } ->
1227 if not (has_element_in_list_item_scope t "li") then
1228 parse_error t "unexpected-end-tag"
1229 else begin
1230 generate_implied_end_tags t ~except:"li" ();
1231 (match current_node t with
1232 | Some n when n.Dom.name <> "li" -> parse_error t "end-tag-too-early"
1233 | _ -> ());
1234 pop_until_tag t "li"
1235 end
1236 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["dd"; "dt"] ->
1237 if not (has_element_in_scope t name) then
1238 parse_error t "unexpected-end-tag"
1239 else begin
1240 generate_implied_end_tags t ~except:name ();
1241 (match current_node t with
1242 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1243 | _ -> ());
1244 pop_until_tag t name
1245 end
1246 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_heading_element name ->
1247 if not (has_element_in_scope_impl t Parser_constants.heading_elements Parser_constants.default_scope ~check_integration_points:true) then
1248 parse_error t "unexpected-end-tag"
1249 else begin
1250 generate_implied_end_tags t ();
1251 (match current_node t with
1252 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1253 | _ -> ());
1254 pop_until_one_of t Parser_constants.heading_elements
1255 end
1256 | Token.Tag { kind = Token.Start; name = "a"; attrs; _ } ->
1257 (* Check for existing <a> in active formatting *)
1258 let rec find_a = function
1259 | [] -> None
1260 | Marker :: _ -> None
1261 | Entry e :: _ when e.name = "a" -> Some e.node
1262 | _ :: rest -> find_a rest
1263 in
1264 (match find_a t.active_formatting with
1265 | Some existing ->
1266 parse_error t "unexpected-start-tag";
1267 adoption_agency t "a";
1268 t.active_formatting <- List.filter (function
1269 | Entry e -> e.node != existing
1270 | _ -> true
1271 ) t.active_formatting;
1272 t.open_elements <- List.filter (fun n -> n != existing) t.open_elements
1273 | None -> ());
1274 reconstruct_active_formatting t;
1275 let node = insert_element t "a" attrs in
1276 t.open_elements <- node :: t.open_elements;
1277 push_formatting_element t node "a" attrs
1278 | Token.Tag { kind = Token.Start; name; attrs; _ }
1279 when List.mem name ["b"; "big"; "code"; "em"; "font"; "i"; "s"; "small"; "strike"; "strong"; "tt"; "u"] ->
1280 reconstruct_active_formatting t;
1281 let node = insert_element t name attrs in
1282 t.open_elements <- node :: t.open_elements;
1283 push_formatting_element t node name attrs
1284 | Token.Tag { kind = Token.Start; name = "nobr"; attrs; _ } ->
1285 if has_element_in_scope t "nobr" then begin
1286 parse_error t "unexpected-start-tag";
1287 adoption_agency t "nobr";
1288 (* Remove nobr from active formatting *)
1289 t.active_formatting <- List.filter (function
1290 | Entry e -> e.name <> "nobr"
1291 | Marker -> true
1292 ) t.active_formatting;
1293 (* Remove nobr from open elements *)
1294 t.open_elements <- List.filter (fun n -> n.Dom.name <> "nobr") t.open_elements
1295 end;
1296 reconstruct_active_formatting t;
1297 let node = insert_element t "nobr" attrs in
1298 t.open_elements <- node :: t.open_elements;
1299 push_formatting_element t node "nobr" attrs
1300 | Token.Tag { kind = Token.End; name; _ }
1301 when List.mem name ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] ->
1302 adoption_agency t name
1303 | Token.Tag { kind = Token.Start; name; attrs; _ }
1304 when List.mem name ["applet"; "marquee"; "object"] ->
1305 reconstruct_active_formatting t;
1306 ignore (insert_element t name ~push:true attrs);
1307 push_formatting_marker t;
1308 t.frameset_ok <- false
1309 | Token.Tag { kind = Token.End; name; _ }
1310 when List.mem name ["applet"; "marquee"; "object"] ->
1311 if not (has_element_in_scope t name) then
1312 parse_error t "unexpected-end-tag"
1313 else begin
1314 generate_implied_end_tags t ();
1315 (match current_node t with
1316 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1317 | _ -> ());
1318 pop_until_tag t name;
1319 clear_active_formatting_to_marker t
1320 end
1321 | Token.Tag { kind = Token.Start; name = "table"; attrs; _ } ->
1322 if t.quirks_mode <> Dom.Quirks && has_element_in_button_scope t "p" then
1323 close_p_element t;
1324 ignore (insert_element t "table" ~push:true attrs);
1325 t.frameset_ok <- false;
1326 t.mode <- Parser_insertion_mode.In_table
1327 | Token.Tag { kind = Token.End; name = "br"; _ } ->
1328 parse_error t "end-tag-br";
1329 reconstruct_active_formatting t;
1330 ignore (insert_element t "br" ~push:true []);
1331 pop_current t;
1332 t.frameset_ok <- false
1333 | Token.Tag { kind = Token.Start; name; attrs; _ }
1334 when List.mem name ["area"; "br"; "embed"; "img"; "keygen"; "wbr"] ->
1335 reconstruct_active_formatting t;
1336 ignore (insert_element t name ~push:true attrs);
1337 pop_current t;
1338 t.frameset_ok <- false
1339 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
1340 reconstruct_active_formatting t;
1341 ignore (insert_element t "input" ~push:true attrs);
1342 pop_current t;
1343 let is_hidden = List.exists (fun (k, v) ->
1344 String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
1345 ) attrs in
1346 if not is_hidden then t.frameset_ok <- false
1347 | Token.Tag { kind = Token.Start; name; attrs; _ }
1348 when List.mem name ["param"; "source"; "track"] ->
1349 ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing = false });
1350 pop_current t
1351 | Token.Tag { kind = Token.Start; name = "hr"; _ } ->
1352 if has_element_in_button_scope t "p" then close_p_element t;
1353 ignore (insert_element t "hr" ~push:true []);
1354 pop_current t;
1355 t.frameset_ok <- false
1356 | Token.Tag { kind = Token.Start; name = "image"; attrs; _ } ->
1357 parse_error t "unexpected-start-tag";
1358 (* Treat <image> as <img> *)
1359 reconstruct_active_formatting t;
1360 ignore (insert_element t "img" ~push:true attrs);
1361 pop_current t;
1362 t.frameset_ok <- false
1363 | Token.Tag { kind = Token.Start; name = "textarea"; attrs; _ } ->
1364 ignore (insert_element t "textarea" ~push:true attrs);
1365 t.ignore_lf <- true;
1366 t.original_mode <- Some t.mode;
1367 t.frameset_ok <- false;
1368 t.mode <- Parser_insertion_mode.Text
1369 | Token.Tag { kind = Token.Start; name = "xmp"; attrs; _ } ->
1370 if has_element_in_button_scope t "p" then close_p_element t;
1371 reconstruct_active_formatting t;
1372 t.frameset_ok <- false;
1373 ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs; self_closing = false });
1374 t.original_mode <- Some t.mode;
1375 t.mode <- Parser_insertion_mode.Text
1376 | Token.Tag { kind = Token.Start; name = "iframe"; attrs; _ } ->
1377 t.frameset_ok <- false;
1378 ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs; self_closing = false });
1379 t.original_mode <- Some t.mode;
1380 t.mode <- Parser_insertion_mode.Text
1381 | Token.Tag { kind = Token.Start; name = "noembed"; attrs; _ } ->
1382 ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs; self_closing = false });
1383 t.original_mode <- Some t.mode;
1384 t.mode <- Parser_insertion_mode.Text
1385 | Token.Tag { kind = Token.Start; name = "select"; attrs; _ } ->
1386 reconstruct_active_formatting t;
1387 ignore (insert_element t "select" ~push:true attrs);
1388 t.frameset_ok <- false;
1389 if List.mem t.mode [Parser_insertion_mode.In_table; Parser_insertion_mode.In_caption; Parser_insertion_mode.In_table_body; Parser_insertion_mode.In_row; Parser_insertion_mode.In_cell] then
1390 t.mode <- Parser_insertion_mode.In_select_in_table
1391 else
1392 t.mode <- Parser_insertion_mode.In_select
1393 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["optgroup"; "option"] ->
1394 (match current_node t with
1395 | Some n when n.Dom.name = "option" -> pop_current t
1396 | _ -> ());
1397 reconstruct_active_formatting t;
1398 ignore (insert_element t name ~push:true attrs)
1399 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rb"; "rtc"] ->
1400 if has_element_in_scope t "ruby" then begin
1401 generate_implied_end_tags t ()
1402 end;
1403 (match current_node t with
1404 | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag"
1405 | _ -> ());
1406 ignore (insert_element t name ~push:true attrs)
1407 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rp"; "rt"] ->
1408 if has_element_in_scope t "ruby" then begin
1409 generate_implied_end_tags t ~except:"rtc" ()
1410 end;
1411 (match current_node t with
1412 | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag"
1413 | _ -> ());
1414 ignore (insert_element t name ~push:true attrs)
1415 | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } ->
1416 reconstruct_active_formatting t;
1417 let adjusted_attrs = Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) in
1418 ignore (insert_foreign_element t { kind = Token.Start; name = "math"; attrs = adjusted_attrs; self_closing } (Some "mathml"));
1419 if self_closing then pop_current t
1420 | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } ->
1421 reconstruct_active_formatting t;
1422 let adjusted_attrs = Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) in
1423 ignore (insert_foreign_element t { kind = Token.Start; name = "svg"; attrs = adjusted_attrs; self_closing } (Some "svg"));
1424 if self_closing then pop_current t
1425 | Token.Tag { kind = Token.Start; name; attrs; _ }
1426 when List.mem name ["col"; "frame"] ->
1427 (* In fragment context, insert these; otherwise ignore *)
1428 if t.fragment_context = None then
1429 parse_error t "unexpected-start-tag-ignored"
1430 else
1431 ignore (insert_element t name attrs)
1432 | Token.Tag { kind = Token.Start; name; _ }
1433 when List.mem name ["caption"; "colgroup"; "head"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1434 parse_error t "unexpected-start-tag"
1435 | Token.Tag { kind = Token.Start; name; attrs; self_closing } ->
1436 (* Any other start tag *)
1437 reconstruct_active_formatting t;
1438 ignore (insert_element t name ~push:true attrs);
1439 (* Check for self-closing on non-void HTML element *)
1440 if self_closing && not (Parser_constants.is_void_element name) then
1441 parse_error t "non-void-html-element-start-tag-with-trailing-solidus"
1442 | Token.Tag { kind = Token.End; name; _ } ->
1443 (* Any other end tag *)
1444 let rec check = function
1445 | [] -> ()
1446 | node :: rest ->
1447 if node.Dom.name = name then begin
1448 generate_implied_end_tags t ~except:name ();
1449 (match current_node t with
1450 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1451 | _ -> ());
1452 pop_until t (fun n -> n == node)
1453 end else if is_special_element node then
1454 parse_error t ("unexpected-end-tag:" ^ name)
1455 else
1456 check rest
1457 in
1458 check t.open_elements
1459
1460and process_text t token =
1461 match token with
1462 | Token.Character data ->
1463 insert_character t data
1464 | Token.EOF ->
1465 parse_error t "expected-closing-tag-but-got-eof";
1466 pop_current t;
1467 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body;
1468 process_token t token
1469 | Token.Tag { kind = Token.End; _ } ->
1470 pop_current t;
1471 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body
1472 | _ -> ()
1473
1474and process_in_table t token =
1475 match token with
1476 | Token.Character _ when (match current_node t with Some n -> Parser_constants.is_foster_parenting_element n.Dom.name | None -> false) ->
1477 t.pending_table_chars <- [];
1478 t.original_mode <- Some t.mode;
1479 t.mode <- Parser_insertion_mode.In_table_text;
1480 process_token t token
1481 | Token.Comment data ->
1482 insert_comment t data
1483 | Token.Doctype _ ->
1484 parse_error t "unexpected-doctype"
1485 | Token.Tag { kind = Token.Start; name = "caption"; attrs; _ } ->
1486 clear_stack_back_to_table_context t;
1487 push_formatting_marker t;
1488 ignore (insert_element t "caption" ~push:true attrs);
1489 t.mode <- Parser_insertion_mode.In_caption
1490 | Token.Tag { kind = Token.Start; name = "colgroup"; attrs; _ } ->
1491 clear_stack_back_to_table_context t;
1492 ignore (insert_element t "colgroup" ~push:true attrs);
1493 t.mode <- Parser_insertion_mode.In_column_group
1494 | Token.Tag { kind = Token.Start; name = "col"; _ } ->
1495 clear_stack_back_to_table_context t;
1496 ignore (insert_element t "colgroup" ~push:true []);
1497 t.mode <- Parser_insertion_mode.In_column_group;
1498 process_token t token
1499 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
1500 clear_stack_back_to_table_context t;
1501 ignore (insert_element t name ~push:true attrs);
1502 t.mode <- Parser_insertion_mode.In_table_body
1503 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"; "tr"] ->
1504 clear_stack_back_to_table_context t;
1505 ignore (insert_element t "tbody" ~push:true []);
1506 t.mode <- Parser_insertion_mode.In_table_body;
1507 process_token t token
1508 | Token.Tag { kind = Token.Start; name = "table"; _ } ->
1509 parse_error t "unexpected-start-tag";
1510 if has_element_in_table_scope t "table" then begin
1511 pop_until_tag t "table";
1512 reset_insertion_mode t;
1513 process_token t token
1514 end
1515 | Token.Tag { kind = Token.End; name = "table"; _ } ->
1516 if not (has_element_in_table_scope t "table") then
1517 parse_error t "unexpected-end-tag"
1518 else begin
1519 pop_until_tag t "table";
1520 reset_insertion_mode t
1521 end
1522 | Token.Tag { kind = Token.End; name; _ }
1523 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1524 parse_error t "unexpected-end-tag"
1525 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["style"; "script"; "template"] ->
1526 process_in_head t token
1527 | Token.Tag { kind = Token.End; name = "template"; _ } ->
1528 process_in_head t token
1529 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
1530 let is_hidden = List.exists (fun (k, v) ->
1531 String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
1532 ) attrs in
1533 if not is_hidden then begin
1534 parse_error t "start-tag-in-table:input";
1535 t.foster_parenting <- true;
1536 process_in_body t token;
1537 t.foster_parenting <- false
1538 end else begin
1539 parse_error t "start-tag-in-table:input";
1540 ignore (insert_element t "input" ~push:true attrs);
1541 pop_current t
1542 end
1543 | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } ->
1544 parse_error t "unexpected-start-tag";
1545 if t.form_element = None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin
1546 let form = insert_element t "form" attrs in
1547 t.open_elements <- form :: t.open_elements;
1548 t.form_element <- Some form;
1549 pop_current t
1550 end
1551 | Token.EOF ->
1552 process_in_body t token
1553 | _ ->
1554 parse_error t "unexpected-token-in-table";
1555 t.foster_parenting <- true;
1556 process_in_body t token;
1557 t.foster_parenting <- false
1558
1559and clear_stack_back_to_table_context t =
1560 let rec loop () =
1561 match current_node t with
1562 | Some n when not (List.mem n.Dom.name ["table"; "template"; "html"]) ->
1563 pop_current t;
1564 loop ()
1565 | _ -> ()
1566 in
1567 loop ()
1568
1569and process_in_table_text t token =
1570 match token with
1571 | Token.Character data ->
1572 if String.contains data '\x00' then
1573 parse_error t "unexpected-null-character"
1574 else
1575 t.pending_table_chars <- data :: t.pending_table_chars
1576 | _ ->
1577 let pending = String.concat "" (List.rev t.pending_table_chars) in
1578 t.pending_table_chars <- [];
1579 if not (is_whitespace pending) then begin
1580 parse_error t "unexpected-character-in-table";
1581 t.foster_parenting <- true;
1582 reconstruct_active_formatting t;
1583 insert_character t pending;
1584 t.foster_parenting <- false
1585 end else
1586 insert_character t pending;
1587 t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_table;
1588 process_token t token
1589
1590and process_in_caption t token =
1591 match token with
1592 | Token.Tag { kind = Token.End; name = "caption"; _ } ->
1593 if not (has_element_in_table_scope t "caption") then
1594 parse_error t "unexpected-end-tag"
1595 else begin
1596 generate_implied_end_tags t ();
1597 (match current_node t with
1598 | Some n when n.Dom.name <> "caption" -> parse_error t "end-tag-too-early"
1599 | _ -> ());
1600 pop_until_tag t "caption";
1601 clear_active_formatting_to_marker t;
1602 t.mode <- Parser_insertion_mode.In_table
1603 end
1604 | Token.Tag { kind = Token.Start; name; _ }
1605 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1606 if not (has_element_in_table_scope t "caption") then
1607 parse_error t "unexpected-start-tag"
1608 else begin
1609 generate_implied_end_tags t ();
1610 pop_until_tag t "caption";
1611 clear_active_formatting_to_marker t;
1612 t.mode <- Parser_insertion_mode.In_table;
1613 process_token t token
1614 end
1615 | Token.Tag { kind = Token.End; name = "table"; _ } ->
1616 if not (has_element_in_table_scope t "caption") then
1617 parse_error t "unexpected-end-tag"
1618 else begin
1619 generate_implied_end_tags t ();
1620 pop_until_tag t "caption";
1621 clear_active_formatting_to_marker t;
1622 t.mode <- Parser_insertion_mode.In_table;
1623 process_token t token
1624 end
1625 | Token.Tag { kind = Token.End; name; _ }
1626 when List.mem name ["body"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1627 parse_error t "unexpected-end-tag"
1628 | _ ->
1629 process_in_body t token
1630
1631and process_in_column_group t token =
1632 match token with
1633 | Token.Character data when is_whitespace data ->
1634 insert_character t data
1635 | Token.Character data ->
1636 (* Split leading whitespace from non-whitespace *)
1637 let ws_chars = [' '; '\t'; '\n'; '\x0C'; '\r'] in
1638 let len = String.length data in
1639 let ws_end = ref 0 in
1640 while !ws_end < len && List.mem data.[!ws_end] ws_chars do incr ws_end done;
1641 if !ws_end > 0 then
1642 insert_character t (String.sub data 0 !ws_end);
1643 if !ws_end < len then begin
1644 let remaining = String.sub data !ws_end (len - !ws_end) in
1645 (match current_node t with
1646 | Some n when n.Dom.name = "colgroup" ->
1647 pop_current t;
1648 t.mode <- Parser_insertion_mode.In_table;
1649 process_token t (Token.Character remaining)
1650 | _ ->
1651 parse_error t "unexpected-token")
1652 end
1653 | Token.Comment data ->
1654 insert_comment t data
1655 | Token.Doctype _ ->
1656 parse_error t "unexpected-doctype"
1657 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
1658 process_in_body t token
1659 | Token.Tag { kind = Token.Start; name = "col"; attrs; _ } ->
1660 ignore (insert_element t "col" ~push:true attrs);
1661 pop_current t
1662 | Token.Tag { kind = Token.End; name = "colgroup"; _ } ->
1663 (match current_node t with
1664 | Some n when n.Dom.name <> "colgroup" -> parse_error t "unexpected-end-tag"
1665 | Some _ -> pop_current t; t.mode <- Parser_insertion_mode.In_table
1666 | None -> parse_error t "unexpected-end-tag")
1667 | Token.Tag { kind = Token.End; name = "col"; _ } ->
1668 parse_error t "unexpected-end-tag"
1669 | Token.Tag { kind = Token.Start; name = "template"; _ }
1670 | Token.Tag { kind = Token.End; name = "template"; _ } ->
1671 process_in_head t token
1672 | Token.EOF ->
1673 process_in_body t token
1674 | _ ->
1675 (match current_node t with
1676 | Some n when n.Dom.name = "colgroup" ->
1677 pop_current t;
1678 t.mode <- Parser_insertion_mode.In_table;
1679 process_token t token
1680 | _ ->
1681 parse_error t "unexpected-token")
1682
1683and process_in_table_body t token =
1684 match token with
1685 | Token.Tag { kind = Token.Start; name = "tr"; attrs; _ } ->
1686 clear_stack_back_to_table_body_context t;
1687 ignore (insert_element t "tr" ~push:true attrs);
1688 t.mode <- Parser_insertion_mode.In_row
1689 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["th"; "td"] ->
1690 parse_error t "unexpected-start-tag";
1691 clear_stack_back_to_table_body_context t;
1692 ignore (insert_element t "tr" ~push:true []);
1693 t.mode <- Parser_insertion_mode.In_row;
1694 process_token t token
1695 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
1696 if not (has_element_in_table_scope t name) then
1697 parse_error t "unexpected-end-tag"
1698 else begin
1699 clear_stack_back_to_table_body_context t;
1700 pop_current t;
1701 t.mode <- Parser_insertion_mode.In_table
1702 end
1703 | Token.Tag { kind = Token.Start; name; _ }
1704 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
1705 if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then
1706 parse_error t "unexpected-start-tag"
1707 else begin
1708 clear_stack_back_to_table_body_context t;
1709 pop_current t;
1710 t.mode <- Parser_insertion_mode.In_table;
1711 process_token t token
1712 end
1713 | Token.Tag { kind = Token.End; name = "table"; _ } ->
1714 if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then
1715 parse_error t "unexpected-end-tag"
1716 else begin
1717 clear_stack_back_to_table_body_context t;
1718 pop_current t;
1719 t.mode <- Parser_insertion_mode.In_table;
1720 process_token t token
1721 end
1722 | Token.Tag { kind = Token.End; name; _ }
1723 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"; "tr"] ->
1724 parse_error t "unexpected-end-tag"
1725 | _ ->
1726 process_in_table t token
1727
1728and clear_stack_back_to_table_body_context t =
1729 let rec loop () =
1730 match current_node t with
1731 | Some n when not (List.mem n.Dom.name ["tbody"; "tfoot"; "thead"; "template"; "html"]) ->
1732 pop_current t;
1733 loop ()
1734 | _ -> ()
1735 in
1736 loop ()
1737
1738and process_in_row t token =
1739 match token with
1740 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["th"; "td"] ->
1741 clear_stack_back_to_table_row_context t;
1742 ignore (insert_element t name ~push:true attrs);
1743 t.mode <- Parser_insertion_mode.In_cell;
1744 push_formatting_marker t
1745 | Token.Tag { kind = Token.End; name = "tr"; _ } ->
1746 if not (has_element_in_table_scope t "tr") then
1747 parse_error t "unexpected-end-tag"
1748 else begin
1749 clear_stack_back_to_table_row_context t;
1750 pop_current t;
1751 t.mode <- Parser_insertion_mode.In_table_body
1752 end
1753 | Token.Tag { kind = Token.Start; name; _ }
1754 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"; "tr"] ->
1755 if not (has_element_in_table_scope t "tr") then
1756 parse_error t "unexpected-start-tag"
1757 else begin
1758 clear_stack_back_to_table_row_context t;
1759 pop_current t;
1760 t.mode <- Parser_insertion_mode.In_table_body;
1761 process_token t token
1762 end
1763 | Token.Tag { kind = Token.End; name = "table"; _ } ->
1764 if not (has_element_in_table_scope t "tr") then
1765 parse_error t "unexpected-end-tag"
1766 else begin
1767 clear_stack_back_to_table_row_context t;
1768 pop_current t;
1769 t.mode <- Parser_insertion_mode.In_table_body;
1770 process_token t token
1771 end
1772 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
1773 if not (has_element_in_table_scope t name) then
1774 parse_error t "unexpected-end-tag"
1775 else if not (has_element_in_table_scope t "tr") then
1776 parse_error t "unexpected-end-tag"
1777 else begin
1778 clear_stack_back_to_table_row_context t;
1779 pop_current t;
1780 t.mode <- Parser_insertion_mode.In_table_body;
1781 process_token t token
1782 end
1783 | Token.Tag { kind = Token.End; name; _ }
1784 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"] ->
1785 parse_error t "unexpected-end-tag"
1786 | _ ->
1787 process_in_table t token
1788
1789and clear_stack_back_to_table_row_context t =
1790 let rec loop () =
1791 match current_node t with
1792 | Some n when not (List.mem n.Dom.name ["tr"; "template"; "html"]) ->
1793 pop_current t;
1794 loop ()
1795 | _ -> ()
1796 in
1797 loop ()
1798
1799and process_in_cell t token =
1800 match token with
1801 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_table_cell_element name ->
1802 if not (has_element_in_table_scope t name) then
1803 parse_error t "unexpected-end-tag"
1804 else begin
1805 generate_implied_end_tags t ();
1806 (match current_node t with
1807 | Some n when not (n.Dom.name = name && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
1808 | _ -> ());
1809 pop_until_html_tag t name;
1810 clear_active_formatting_to_marker t;
1811 t.mode <- Parser_insertion_mode.In_row
1812 end
1813 | Token.Tag { kind = Token.Start; name; _ }
1814 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1815 if not (has_element_in_scope_impl t ["td"; "th"] Parser_constants.table_scope ~check_integration_points:false) then
1816 parse_error t "unexpected-start-tag"
1817 else begin
1818 close_cell t;
1819 process_token t token
1820 end
1821 | Token.Tag { kind = Token.End; name; _ }
1822 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] ->
1823 parse_error t "unexpected-end-tag"
1824 | Token.Tag { kind = Token.End; name; _ }
1825 when Parser_constants.is_foster_parenting_element name ->
1826 if not (has_element_in_table_scope t name) then
1827 parse_error t "unexpected-end-tag"
1828 else begin
1829 close_cell t;
1830 process_token t token
1831 end
1832 | _ ->
1833 process_in_body t token
1834
1835and close_cell t =
1836 generate_implied_end_tags t ();
1837 (match current_node t with
1838 | Some n when not (Parser_constants.is_table_cell_element n.Dom.name && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
1839 | _ -> ());
1840 pop_until_html_one_of t ["td"; "th"];
1841 clear_active_formatting_to_marker t;
1842 t.mode <- Parser_insertion_mode.In_row
1843
1844and process_in_select t token =
1845 match token with
1846 | Token.Character "\x00" ->
1847 parse_error t "unexpected-null-character"
1848 | Token.Character data ->
1849 reconstruct_active_formatting t;
1850 insert_character t data
1851 | Token.Comment data ->
1852 insert_comment t data
1853 | Token.Doctype _ ->
1854 parse_error t "unexpected-doctype"
1855 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
1856 process_in_body t token
1857 | Token.Tag { kind = Token.Start; name = "option"; attrs; _ } ->
1858 (match current_node t with
1859 | Some n when n.Dom.name = "option" -> pop_current t
1860 | _ -> ());
1861 reconstruct_active_formatting t;
1862 ignore (insert_element t "option" ~push:true attrs)
1863 | Token.Tag { kind = Token.Start; name = "optgroup"; attrs; _ } ->
1864 (match current_node t with
1865 | Some n when n.Dom.name = "option" -> pop_current t
1866 | _ -> ());
1867 (match current_node t with
1868 | Some n when n.Dom.name = "optgroup" -> pop_current t
1869 | _ -> ());
1870 reconstruct_active_formatting t;
1871 ignore (insert_element t "optgroup" ~push:true attrs)
1872 | Token.Tag { kind = Token.Start; name = "hr"; _ } ->
1873 (match current_node t with
1874 | Some n when n.Dom.name = "option" -> pop_current t
1875 | _ -> ());
1876 (match current_node t with
1877 | Some n when n.Dom.name = "optgroup" -> pop_current t
1878 | _ -> ());
1879 ignore (insert_element t "hr" ~push:true []);
1880 pop_current t
1881 | Token.Tag { kind = Token.End; name = "optgroup"; _ } ->
1882 (match t.open_elements with
1883 | opt :: optg :: _ when opt.Dom.name = "option" && optg.Dom.name = "optgroup" ->
1884 pop_current t
1885 | _ -> ());
1886 (match current_node t with
1887 | Some n when n.Dom.name = "optgroup" -> pop_current t
1888 | _ -> parse_error t "unexpected-end-tag")
1889 | Token.Tag { kind = Token.End; name = "option"; _ } ->
1890 (match current_node t with
1891 | Some n when n.Dom.name = "option" -> pop_current t
1892 | _ -> parse_error t "unexpected-end-tag")
1893 | Token.Tag { kind = Token.End; name = "select"; _ } ->
1894 if not (has_element_in_select_scope t "select") then
1895 parse_error t "unexpected-end-tag"
1896 else begin
1897 pop_until_tag t "select";
1898 reset_insertion_mode t
1899 end
1900 | Token.Tag { kind = Token.Start; name = "select"; _ } ->
1901 parse_error t "unexpected-start-tag";
1902 (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *)
1903 pop_until_tag t "select";
1904 reset_insertion_mode t
1905 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["input"; "textarea"] ->
1906 parse_error t "unexpected-start-tag";
1907 (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *)
1908 pop_until_tag t "select";
1909 reset_insertion_mode t;
1910 process_token t token
1911 | Token.Tag { kind = Token.Start; name = "plaintext"; attrs; _ } ->
1912 (* plaintext is allowed in select - creates element, parser will switch tokenizer to PLAINTEXT mode *)
1913 reconstruct_active_formatting t;
1914 ignore (insert_element t "plaintext" ~push:true attrs)
1915 | Token.Tag { kind = Token.Start; name = "menuitem"; attrs; _ } ->
1916 (* menuitem is allowed in select *)
1917 reconstruct_active_formatting t;
1918 ignore (insert_element t "menuitem" ~push:true attrs)
1919 | Token.Tag { kind = Token.Start; name = "keygen"; attrs; _ } ->
1920 (* keygen is handled specially in select - inserted directly *)
1921 reconstruct_active_formatting t;
1922 ignore (insert_element t "keygen" attrs)
1923 (* Void element, don't push to stack *)
1924 | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } ->
1925 reconstruct_active_formatting t;
1926 let node = insert_foreign_element t { kind = Token.Start; name = "svg"; attrs; self_closing } (Some "svg") in
1927 if not self_closing then t.open_elements <- node :: t.open_elements
1928 | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } ->
1929 reconstruct_active_formatting t;
1930 let node = insert_foreign_element t { kind = Token.Start; name = "math"; attrs; self_closing } (Some "mathml") in
1931 if not self_closing then t.open_elements <- node :: t.open_elements
1932 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["script"; "template"] ->
1933 process_in_head t token
1934 | Token.Tag { kind = Token.End; name = "template"; _ } ->
1935 process_in_head t token
1936 (* Allow certain HTML elements in select - newer spec behavior *)
1937 | Token.Tag { kind = Token.Start; name; attrs; self_closing } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] ->
1938 reconstruct_active_formatting t;
1939 let node = insert_element t name attrs in
1940 if not self_closing then t.open_elements <- node :: t.open_elements
1941 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["br"; "img"] ->
1942 reconstruct_active_formatting t;
1943 ignore (insert_element t name attrs)
1944 (* Don't push to stack - void elements *)
1945 (* Handle formatting elements in select *)
1946 | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_formatting_element name ->
1947 reconstruct_active_formatting t;
1948 let node = insert_element t name ~push:true attrs in
1949 push_formatting_element t node name attrs
1950 | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_formatting_element name ->
1951 (* Find select element and check if formatting element is inside select *)
1952 let select_idx = ref None in
1953 let fmt_idx = ref None in
1954 List.iteri (fun i n ->
1955 if n.Dom.name = "select" && !select_idx = None then select_idx := Some i;
1956 if n.Dom.name = name then fmt_idx := Some i
1957 ) t.open_elements;
1958 (match !fmt_idx, !select_idx with
1959 | Some fi, Some si when fi < si ->
1960 (* Formatting element is inside select, run adoption agency *)
1961 adoption_agency t name
1962 | Some _, Some _ ->
1963 (* Formatting element is outside select boundary - parse error, ignore *)
1964 parse_error t "unexpected-end-tag"
1965 | Some _, None ->
1966 adoption_agency t name
1967 | None, _ ->
1968 parse_error t "unexpected-end-tag")
1969 (* End tags for HTML elements allowed in select *)
1970 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] ->
1971 (* Find select and target indices *)
1972 let select_idx = ref None in
1973 let target_idx = ref None in
1974 List.iteri (fun i n ->
1975 if n.Dom.name = "select" && !select_idx = None then select_idx := Some i;
1976 if n.Dom.name = name then target_idx := Some i
1977 ) t.open_elements;
1978 (* Only pop if target exists and is inside select (lower index = closer to current) *)
1979 (match !target_idx, !select_idx with
1980 | Some ti, Some si when ti < si ->
1981 (* Pop until we reach the target *)
1982 let rec pop_to_target () =
1983 match t.open_elements with
1984 | [] -> ()
1985 | n :: rest ->
1986 t.open_elements <- rest;
1987 if n.Dom.name <> name then pop_to_target ()
1988 in
1989 pop_to_target ()
1990 | Some _, Some _ ->
1991 parse_error t "unexpected-end-tag"
1992 | Some _, None ->
1993 (* No select on stack, just pop to target *)
1994 let rec pop_to_target () =
1995 match t.open_elements with
1996 | [] -> ()
1997 | n :: rest ->
1998 t.open_elements <- rest;
1999 if n.Dom.name <> name then pop_to_target ()
2000 in
2001 pop_to_target ()
2002 | None, _ ->
2003 parse_error t "unexpected-end-tag")
2004 | Token.EOF ->
2005 process_in_body t token
2006 | _ ->
2007 parse_error t "unexpected-token-in-select"
2008
2009and process_in_select_in_table t token =
2010 match token with
2011 | Token.Tag { kind = Token.Start; name; _ }
2012 when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] ->
2013 parse_error t "unexpected-start-tag";
2014 pop_until_tag t "select";
2015 reset_insertion_mode t;
2016 process_token t token
2017 | Token.Tag { kind = Token.End; name; _ }
2018 when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] ->
2019 parse_error t "unexpected-end-tag";
2020 if has_element_in_table_scope t name then begin
2021 pop_until_tag t "select";
2022 reset_insertion_mode t;
2023 process_token t token
2024 end
2025 | _ ->
2026 process_in_select t token
2027
2028and process_in_template t token =
2029 match token with
2030 | Token.Character _ | Token.Comment _ | Token.Doctype _ ->
2031 process_in_body t token
2032 | Token.Tag { kind = Token.Start; name; _ }
2033 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
2034 process_in_head t token
2035 | Token.Tag { kind = Token.End; name = "template"; _ } ->
2036 process_in_head t token
2037 | Token.Tag { kind = Token.Start; name; _ }
2038 when List.mem name ["caption"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
2039 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2040 t.template_modes <- Parser_insertion_mode.In_table :: t.template_modes;
2041 t.mode <- Parser_insertion_mode.In_table;
2042 process_token t token
2043 | Token.Tag { kind = Token.Start; name = "col"; _ } ->
2044 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2045 t.template_modes <- Parser_insertion_mode.In_column_group :: t.template_modes;
2046 t.mode <- Parser_insertion_mode.In_column_group;
2047 process_token t token
2048 | Token.Tag { kind = Token.Start; name = "tr"; _ } ->
2049 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2050 t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes;
2051 t.mode <- Parser_insertion_mode.In_table_body;
2052 process_token t token
2053 | Token.Tag { kind = Token.Start; name; _ } when Parser_constants.is_table_cell_element name ->
2054 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2055 t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes;
2056 t.mode <- Parser_insertion_mode.In_row;
2057 process_token t token
2058 | Token.Tag { kind = Token.Start; _ } ->
2059 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2060 t.template_modes <- Parser_insertion_mode.In_body :: t.template_modes;
2061 t.mode <- Parser_insertion_mode.In_body;
2062 process_token t token
2063 | Token.Tag { kind = Token.End; name; _ } ->
2064 parse_error t ("unexpected-end-tag:" ^ name)
2065 | Token.EOF ->
2066 if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then
2067 () (* Stop parsing *)
2068 else begin
2069 parse_error t "expected-closing-tag-but-got-eof";
2070 pop_until_html_tag t "template";
2071 clear_active_formatting_to_marker t;
2072 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2073 reset_insertion_mode t;
2074 process_token t token
2075 end
2076
2077and process_after_body t token =
2078 match token with
2079 | Token.Character data when is_whitespace data ->
2080 process_in_body t token
2081 | Token.Comment data ->
2082 (* Insert as last child of html element - html is at bottom of stack *)
2083 let html_opt = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in
2084 (match html_opt with
2085 | Some html ->
2086 let location = Dom.make_location ~line:t.current_line ~column:t.current_column () in
2087 Dom.append_child html (Dom.create_comment ~location data)
2088 | None -> ())
2089 | Token.Doctype _ ->
2090 parse_error t "unexpected-doctype"
2091 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
2092 process_in_body t token
2093 | Token.Tag { kind = Token.End; name = "html"; _ } ->
2094 if t.fragment_context <> None then
2095 parse_error t "unexpected-end-tag"
2096 else
2097 t.mode <- Parser_insertion_mode.After_after_body
2098 | Token.EOF ->
2099 () (* Stop parsing *)
2100 | _ ->
2101 parse_error t "unexpected-token-after-body";
2102 t.mode <- Parser_insertion_mode.In_body;
2103 process_token t token
2104
2105and process_in_frameset t token =
2106 match token with
2107 | Token.Character data ->
2108 (* Extract only whitespace characters and insert them *)
2109 let whitespace = String.to_seq data
2110 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
2111 |> String.of_seq in
2112 if whitespace <> "" then insert_character t whitespace;
2113 if not (is_whitespace data) then
2114 parse_error t "unexpected-char-in-frameset"
2115 | Token.Comment data ->
2116 insert_comment t data
2117 | Token.Doctype _ ->
2118 parse_error t "unexpected-doctype"
2119 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
2120 process_in_body t token
2121 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
2122 ignore (insert_element t "frameset" ~push:true attrs)
2123 | Token.Tag { kind = Token.End; name = "frameset"; _ } ->
2124 (match current_node t with
2125 | Some n when n.Dom.name = "html" -> parse_error t "unexpected-end-tag"
2126 | _ ->
2127 pop_current t;
2128 if t.fragment_context = None then
2129 (match current_node t with
2130 | Some n when n.Dom.name <> "frameset" -> t.mode <- Parser_insertion_mode.After_frameset
2131 | _ -> ()))
2132 | Token.Tag { kind = Token.Start; name = "frame"; attrs; _ } ->
2133 ignore (insert_element t "frame" ~push:true attrs);
2134 pop_current t
2135 | Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
2136 process_in_head t token
2137 | Token.EOF ->
2138 (match current_node t with
2139 | Some n when n.Dom.name <> "html" -> parse_error t "expected-closing-tag-but-got-eof"
2140 | _ -> ())
2141 | _ ->
2142 parse_error t "unexpected-token-in-frameset"
2143
2144and process_after_frameset t token =
2145 match token with
2146 | Token.Character data ->
2147 (* Extract only whitespace characters and insert them *)
2148 let whitespace = String.to_seq data
2149 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
2150 |> String.of_seq in
2151 if whitespace <> "" then insert_character t whitespace;
2152 if not (is_whitespace data) then
2153 parse_error t "unexpected-char-after-frameset"
2154 | Token.Comment data ->
2155 insert_comment t data
2156 | Token.Doctype _ ->
2157 parse_error t "unexpected-doctype"
2158 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
2159 process_in_body t token
2160 | Token.Tag { kind = Token.End; name = "html"; _ } ->
2161 t.mode <- Parser_insertion_mode.After_after_frameset
2162 | Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
2163 process_in_head t token
2164 | Token.EOF ->
2165 () (* Stop parsing *)
2166 | _ ->
2167 parse_error t "unexpected-token-after-frameset"
2168
2169and process_after_after_body t token =
2170 match token with
2171 | Token.Comment data ->
2172 insert_comment_to_document t data
2173 | Token.Doctype _ ->
2174 process_in_body t token
2175 | Token.Character data when is_whitespace data ->
2176 process_in_body t token
2177 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
2178 process_in_body t token
2179 | Token.EOF ->
2180 () (* Stop parsing *)
2181 | _ ->
2182 parse_error t "unexpected-token-after-after-body";
2183 t.mode <- Parser_insertion_mode.In_body;
2184 process_token t token
2185
2186and process_after_after_frameset t token =
2187 match token with
2188 | Token.Comment data ->
2189 insert_comment_to_document t data
2190 | Token.Doctype _ ->
2191 process_in_body t token
2192 | Token.Character data ->
2193 (* Extract only whitespace characters and process using in_body rules *)
2194 let whitespace = String.to_seq data
2195 |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
2196 |> String.of_seq in
2197 if whitespace <> "" then process_in_body t (Token.Character whitespace);
2198 if not (is_whitespace data) then
2199 parse_error t "unexpected-char-after-after-frameset"
2200 | Token.Tag { kind = Token.Start; name = "html"; _ } ->
2201 process_in_body t token
2202 | Token.EOF ->
2203 () (* Stop parsing *)
2204 | Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
2205 process_in_head t token
2206 | _ ->
2207 parse_error t "unexpected-token-after-after-frameset"
2208
2209and process_token t token =
2210 (* Check for HTML integration points (SVG foreignObject, desc, title) *)
2211 let is_html_integration_point node =
2212 (* SVG foreignObject, desc, and title are always HTML integration points *)
2213 if node.Dom.namespace = Some "svg" &&
2214 Parser_constants.is_svg_html_integration node.Dom.name then true
2215 (* annotation-xml is an HTML integration point only with specific encoding values *)
2216 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
2217 match List.assoc_opt "encoding" node.Dom.attrs with
2218 | Some enc ->
2219 let enc_lower = String.lowercase_ascii enc in
2220 enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
2221 | None -> false
2222 else false
2223 in
2224 (* Check for MathML text integration points *)
2225 let is_mathml_text_integration_point node =
2226 node.Dom.namespace = Some "mathml" &&
2227 Parser_constants.is_mathml_text_integration node.Dom.name
2228 in
2229 (* Foreign content handling *)
2230 let in_foreign =
2231 match adjusted_current_node t with
2232 | None -> false
2233 | Some node ->
2234 if is_in_html_namespace node then false
2235 else begin
2236 (* At HTML integration points, characters and start tags (except mglyph/malignmark) use HTML rules *)
2237 if is_html_integration_point node then begin
2238 match token with
2239 | Token.Character _ -> false
2240 | Token.Tag { kind = Token.Start; _ } -> false
2241 | _ -> true
2242 end
2243 (* At MathML text integration points, characters and start tags (except mglyph/malignmark) use HTML rules *)
2244 else if is_mathml_text_integration_point node then begin
2245 match token with
2246 | Token.Character _ -> false
2247 | Token.Tag { kind = Token.Start; name; _ } ->
2248 name = "mglyph" || name = "malignmark"
2249 | _ -> true
2250 end
2251 (* Special case: <svg> inside annotation-xml uses HTML rules (creates svg in svg namespace) *)
2252 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then begin
2253 match token with
2254 | Token.Tag { kind = Token.Start; name; _ } when String.lowercase_ascii name = "svg" -> false
2255 | _ -> true
2256 end
2257 (* Not at integration point - use foreign content rules *)
2258 (* Breakout handling is done inside process_foreign_content *)
2259 else true
2260 end
2261 in
2262
2263 (* Check if at HTML integration point for special table mode handling *)
2264 let at_integration_point =
2265 match adjusted_current_node t with
2266 | Some node ->
2267 is_html_integration_point node || is_mathml_text_integration_point node
2268 | None -> false
2269 in
2270
2271 if in_foreign then
2272 process_foreign_content t token
2273 else if at_integration_point then begin
2274 (* At integration points, check if in table mode without table in scope *)
2275 let is_table_mode = List.mem t.mode [In_table; In_table_body; In_row; In_cell; In_caption; In_column_group] in
2276 let has_table = has_element_in_table_scope t "table" in
2277 if is_table_mode && not has_table then begin
2278 match token with
2279 | Token.Tag { kind = Token.Start; _ } ->
2280 (* Temporarily use IN_BODY for start tags in table mode without table *)
2281 let saved_mode = t.mode in
2282 t.mode <- In_body;
2283 process_by_mode t token;
2284 if t.mode = In_body then t.mode <- saved_mode
2285 | _ -> process_by_mode t token
2286 end else
2287 process_by_mode t token
2288 end else
2289 process_by_mode t token
2290
2291(* Pop foreign elements until HTML or integration point *)
2292and pop_until_html_or_integration_point t =
2293 let is_html_integration_point node =
2294 (* SVG foreignObject, desc, and title are always HTML integration points *)
2295 if node.Dom.namespace = Some "svg" &&
2296 Parser_constants.is_svg_html_integration node.Dom.name then true
2297 (* annotation-xml is an HTML integration point only with specific encoding values *)
2298 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
2299 match List.assoc_opt "encoding" node.Dom.attrs with
2300 | Some enc ->
2301 let enc_lower = String.lowercase_ascii enc in
2302 enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
2303 | None -> false
2304 else false
2305 in
2306 (* Get fragment context element - only for foreign namespace fragment contexts *)
2307 let fragment_context_elem = t.fragment_context_element in
2308 let rec pop () =
2309 match current_node t with
2310 | None -> ()
2311 | Some node ->
2312 if is_in_html_namespace node then ()
2313 else if is_html_integration_point node then ()
2314 (* Don't pop past fragment context element *)
2315 else (match fragment_context_elem with
2316 | Some ctx when node == ctx -> ()
2317 | _ ->
2318 pop_current t;
2319 pop ())
2320 in
2321 pop ()
2322
2323(* Foreign breakout elements - these break out of foreign content *)
2324and is_foreign_breakout_element name =
2325 List.mem (String.lowercase_ascii name)
2326 ["b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt";
2327 "em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li";
2328 "listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span";
2329 "strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var"]
2330
2331and process_foreign_content t token =
2332 match token with
2333 | Token.Character data when String.contains data '\x00' ->
2334 (* Replace NUL characters with U+FFFD replacement character *)
2335 parse_error t "unexpected-null-character";
2336 let buf = Buffer.create (String.length data) in
2337 let has_non_ws_non_nul = ref false in
2338 String.iter (fun c ->
2339 if c = '\x00' then Buffer.add_string buf "\xEF\xBF\xBD"
2340 else begin
2341 Buffer.add_char buf c;
2342 if not (c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r') then
2343 has_non_ws_non_nul := true
2344 end
2345 ) data;
2346 let replaced = Buffer.contents buf in
2347 insert_character t replaced;
2348 (* Only set frameset_ok to false if there's actual non-whitespace non-NUL content *)
2349 if !has_non_ws_non_nul then t.frameset_ok <- false
2350 | Token.Character data when is_whitespace data ->
2351 insert_character t data
2352 | Token.Character data ->
2353 insert_character t data;
2354 t.frameset_ok <- false
2355 | Token.Comment data ->
2356 insert_comment t data
2357 | Token.Doctype _ ->
2358 parse_error t "unexpected-doctype"
2359 | Token.Tag { kind = Token.Start; name; _ } when is_foreign_breakout_element name ->
2360 (* Breakout from foreign content - pop until HTML or integration point, reprocess in HTML mode *)
2361 parse_error t "unexpected-html-element-in-foreign-content";
2362 pop_until_html_or_integration_point t;
2363 reset_insertion_mode t;
2364 (* Use process_by_mode to force HTML mode processing and avoid infinite loop *)
2365 process_by_mode t token
2366 | Token.Tag { kind = Token.Start; name = "font"; attrs; _ }
2367 when List.exists (fun (n, _) ->
2368 let n = String.lowercase_ascii n in
2369 n = "color" || n = "face" || n = "size") attrs ->
2370 (* font with color/face/size breaks out of foreign content *)
2371 parse_error t "unexpected-html-element-in-foreign-content";
2372 pop_until_html_or_integration_point t;
2373 reset_insertion_mode t;
2374 process_by_mode t token
2375 | Token.Tag { kind = Token.Start; name; attrs; self_closing } ->
2376 let name =
2377 match adjusted_current_node t with
2378 | Some n when n.Dom.namespace = Some "svg" -> Parser_constants.adjust_svg_tag_name name
2379 | _ -> name
2380 in
2381 let attrs =
2382 match adjusted_current_node t with
2383 | Some n when n.Dom.namespace = Some "svg" ->
2384 Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs)
2385 | Some n when n.Dom.namespace = Some "mathml" ->
2386 Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs)
2387 | _ -> Parser_constants.adjust_foreign_attrs attrs
2388 in
2389 let namespace =
2390 match adjusted_current_node t with
2391 | Some n -> n.Dom.namespace
2392 | None -> None
2393 in
2394 let node = insert_element t name ~namespace attrs in
2395 t.open_elements <- node :: t.open_elements;
2396 if self_closing then pop_current t
2397 | Token.Tag { kind = Token.End; name; _ } when List.mem (String.lowercase_ascii name) ["br"; "p"] ->
2398 (* Special case: </br> and </p> end tags trigger breakout from foreign content *)
2399 parse_error t "unexpected-html-element-in-foreign-content";
2400 pop_until_html_or_integration_point t;
2401 reset_insertion_mode t;
2402 (* Use process_by_mode to force HTML mode processing and avoid infinite loop *)
2403 process_by_mode t token
2404 | Token.Tag { kind = Token.End; name; _ } ->
2405 (* Find matching element per WHATWG spec for foreign content *)
2406 let is_fragment_context n =
2407 match t.fragment_context_element with
2408 | Some ctx -> n == ctx
2409 | None -> false
2410 in
2411 let name_lower = String.lowercase_ascii name in
2412 (* Walk through stack looking for matching element *)
2413 let rec find_and_process first_node idx = function
2414 | [] -> () (* Stack exhausted - ignore tag *)
2415 | n :: rest ->
2416 let node_name_lower = String.lowercase_ascii n.Dom.name in
2417 let is_html = is_in_html_namespace n in
2418 let name_matches = node_name_lower = name_lower in
2419
2420 (* If first node doesn't match tag name, it's a parse error *)
2421 if first_node && not name_matches then
2422 parse_error t "unexpected-end-tag-in-foreign-content";
2423
2424 (* Check if this node matches the end tag *)
2425 if name_matches then begin
2426 (* Fragment context check *)
2427 if is_fragment_context n then
2428 parse_error t "unexpected-end-tag-in-fragment-context"
2429 (* If matched element is in HTML namespace, reprocess via HTML mode *)
2430 else if is_html then
2431 process_by_mode t token
2432 (* Otherwise it's a foreign element - pop everything from this point up *)
2433 else begin
2434 (* Pop all elements from current down to and including the matched element *)
2435 let rec pop_to_idx current_idx =
2436 if current_idx >= idx then begin
2437 pop_current t;
2438 pop_to_idx (current_idx - 1)
2439 end
2440 in
2441 pop_to_idx (List.length t.open_elements - 1)
2442 end
2443 end
2444 (* If we hit an HTML element that doesn't match, process via HTML mode *)
2445 else if is_html then
2446 process_by_mode t token
2447 (* Continue searching in the stack *)
2448 else
2449 find_and_process false (idx - 1) rest
2450 in
2451 find_and_process true (List.length t.open_elements - 1) t.open_elements
2452 | Token.EOF ->
2453 process_by_mode t token
2454
2455and process_by_mode t token =
2456 match t.mode with
2457 | Parser_insertion_mode.Initial -> process_initial t token
2458 | Parser_insertion_mode.Before_html -> process_before_html t token
2459 | Parser_insertion_mode.Before_head -> process_before_head t token
2460 | Parser_insertion_mode.In_head -> process_in_head t token
2461 | Parser_insertion_mode.In_head_noscript -> process_in_head_noscript t token
2462 | Parser_insertion_mode.After_head -> process_after_head t token
2463 | Parser_insertion_mode.In_body -> process_in_body t token
2464 | Parser_insertion_mode.Text -> process_text t token
2465 | Parser_insertion_mode.In_table -> process_in_table t token
2466 | Parser_insertion_mode.In_table_text -> process_in_table_text t token
2467 | Parser_insertion_mode.In_caption -> process_in_caption t token
2468 | Parser_insertion_mode.In_column_group -> process_in_column_group t token
2469 | Parser_insertion_mode.In_table_body -> process_in_table_body t token
2470 | Parser_insertion_mode.In_row -> process_in_row t token
2471 | Parser_insertion_mode.In_cell -> process_in_cell t token
2472 | Parser_insertion_mode.In_select -> process_in_select t token
2473 | Parser_insertion_mode.In_select_in_table -> process_in_select_in_table t token
2474 | Parser_insertion_mode.In_template -> process_in_template t token
2475 | Parser_insertion_mode.After_body -> process_after_body t token
2476 | Parser_insertion_mode.In_frameset -> process_in_frameset t token
2477 | Parser_insertion_mode.After_frameset -> process_after_frameset t token
2478 | Parser_insertion_mode.After_after_body -> process_after_after_body t token
2479 | Parser_insertion_mode.After_after_frameset -> process_after_after_frameset t token
2480
2481(* Populate selectedcontent elements with content from selected option *)
2482let find_elements name node =
2483 let result = ref [] in
2484 let rec find n =
2485 if n.Dom.name = name then result := n :: !result;
2486 List.iter find n.Dom.children
2487 in
2488 find node;
2489 List.rev !result (* Reverse to maintain document order *)
2490
2491let find_element name node =
2492 let rec find n =
2493 if n.Dom.name = name then Some n
2494 else
2495 List.find_map find n.Dom.children
2496 in
2497 find node
2498
2499let populate_selectedcontent document =
2500 let selects = find_elements "select" document in
2501 List.iter (fun select ->
2502 match find_element "selectedcontent" select with
2503 | None -> ()
2504 | Some selectedcontent ->
2505 let options = find_elements "option" select in
2506 if options <> [] then begin
2507 (* Find selected option or use first *)
2508 let selected_option =
2509 match List.find_opt (fun opt -> Dom.has_attr opt "selected") options with
2510 | Some opt -> opt
2511 | None -> List.hd options
2512 in
2513 (* Clone children from selected option to selectedcontent *)
2514 List.iter (fun child ->
2515 let cloned = Dom.clone ~deep:true child in
2516 Dom.append_child selectedcontent cloned
2517 ) selected_option.Dom.children
2518 end
2519 ) selects
2520
2521let finish t =
2522 (* Populate selectedcontent elements *)
2523 populate_selectedcontent t.document;
2524 (* For fragment parsing, remove the html wrapper and promote children *)
2525 if t.fragment_context <> None then begin
2526 match t.document.Dom.children with
2527 | [root] when root.Dom.name = "html" ->
2528 (* Move context element's children to root if applicable *)
2529 (match t.fragment_context_element with
2530 | Some ctx_elem ->
2531 (match ctx_elem.Dom.parent with
2532 | Some p when p == root ->
2533 let ctx_children = ctx_elem.Dom.children in
2534 List.iter (fun child ->
2535 Dom.remove_child ctx_elem child;
2536 Dom.append_child root child
2537 ) ctx_children;
2538 Dom.remove_child root ctx_elem
2539 | _ -> ())
2540 | None -> ());
2541 (* Promote root's children to document - preserve order *)
2542 let children_copy = root.Dom.children in
2543 List.iter (fun child ->
2544 Dom.remove_child root child;
2545 Dom.append_child t.document child
2546 ) children_copy;
2547 Dom.remove_child t.document root
2548 | _ -> ()
2549 end;
2550 t.document
2551
2552let get_errors t = List.rev t.errors