OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** HTML5 DOM Node Types and Operations
7
8 This module provides the DOM (Document Object Model) node representation
9 used by the HTML5 parser. The DOM is a programming interface that
10 represents an HTML document as a tree of nodes, where each node represents
11 part of the document (an element, text content, comment, etc.).
12
13 {2 What is the DOM?}
14
15 When an HTML parser processes markup like [<p>Hello <b>world</b></p>], it
16 doesn't store the text directly. Instead, it builds a tree structure in
17 memory:
18
19 {v
20 Document
21 └── html
22     └── body
23         └── p
24             ├── #text "Hello "
25             └── b
26                 └── #text "world"
27 v}
28
29 This tree is the DOM. Each box in the tree is a {i node}. Programs can
30 traverse and modify this tree to read or change the document.
31
32 @see <https://html.spec.whatwg.org/multipage/dom.html>
33 WHATWG: Semantics, structure, and APIs of HTML documents (DOM chapter)
34
35 {2 Node Types}
36
37 The HTML5 DOM includes several node types, all represented by the same
38 record type with different field usage:
39
40 - {b Element nodes}: HTML elements like [<div>], [<p>], [<a href="...">].
41 Elements are the building blocks of HTML documents. They can have
42 attributes and contain other nodes.
43
44 - {b Text nodes}: The actual text content within elements. For example,
45 in [<p>Hello</p>], "Hello" is a text node that is a child of the [<p>]
46 element.
47
48 - {b Comment nodes}: HTML comments written as [<!-- comment text -->].
49 Comments are preserved in the DOM but not rendered.
50
51 - {b Document nodes}: The root of the entire document tree. Every HTML
52 document has exactly one Document node at the top.
53
54 - {b Document fragment nodes}: Lightweight containers that hold a
55 collection of nodes without a parent. Used for efficient batch DOM
56 operations and [<template>] element contents.
57
58 - {b Doctype nodes}: The [<!DOCTYPE html>] declaration at the start of
59 HTML5 documents. This declaration tells browsers to render the page
60 in standards mode.
61
62 @see <https://html.spec.whatwg.org/multipage/dom.html#kinds-of-content>
63 WHATWG: Kinds of content
64
65 {2 Namespaces}
66
67 HTML5 can embed content from other XML vocabularies. Elements belong to
68 one of three {i namespaces}:
69
70 - {b HTML namespace} ([None] or implicit): Standard HTML elements like
71 [<div>], [<p>], [<table>]. This is the default for all elements.
72
73 - {b SVG namespace} ([Some "svg"]): Scalable Vector Graphics for drawing.
74 When the parser encounters an [<svg>] tag, all elements inside it
75 (like [<rect>], [<circle>], [<path>]) are placed in the SVG namespace.
76
77 - {b MathML namespace} ([Some "mathml"]): Mathematical Markup Language
78 for equations. When the parser encounters a [<math>] tag, elements
79 inside it are placed in the MathML namespace.
80
81 The parser automatically switches namespaces when entering and leaving
82 these foreign content islands.
83
84 @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign>
85 WHATWG: Parsing foreign content
86
87 {2 Tree Structure}
88
89 Nodes form a bidirectional tree: each node has a list of children and
90 an optional parent reference. Modification functions in this module
91 maintain these references automatically.
92
93 The tree is always well-formed: a node can only have one parent, and
94 circular references are not possible.
95*)
96
97(** {1 Types} *)
98
99(** Information associated with a DOCTYPE node.
100
101 The {i document type declaration} (DOCTYPE) tells browsers what version
102 of HTML the document uses. In HTML5, the standard declaration is simply:
103
104 {v <!DOCTYPE html> v}
105
106 This minimal DOCTYPE triggers {i standards mode} (no quirks). The DOCTYPE
107 can optionally include a public identifier and system identifier for
108 legacy compatibility with SGML-based tools, but these are rarely used
109 in modern HTML5 documents.
110
111 {b Historical context:} In HTML4 and XHTML, DOCTYPEs were verbose and
112 referenced DTD files. For example:
113 {v <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
114 "http://www.w3.org/TR/html4/strict.dtd"> v}
115
116 HTML5 simplified this to just [<!DOCTYPE html>] because:
117 - Browsers never actually fetched or validated against DTDs
118 - The DOCTYPE's only real purpose is triggering standards mode
119 - A minimal DOCTYPE achieves this goal
120
121 {b Field meanings:}
122 - [name]: The document type name, almost always ["html"] for HTML documents
123 - [public_id]: A public identifier (legacy); [None] for HTML5
124 - [system_id]: A system identifier/URL (legacy); [None] for HTML5
125
126 @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
127 WHATWG: The DOCTYPE
128 @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
129 WHATWG: DOCTYPE handling during parsing
130*)
131type doctype_data = {
132 name : string option; (** The DOCTYPE name, e.g., "html"; [None] when no name is present *)
133 public_id : string option; (** Public identifier (legacy, rarely used); [None] when absent *)
134 system_id : string option; (** System identifier (legacy, rarely used); [None] when absent *)
135}
136
137val pp_doctype_data : Format.formatter -> doctype_data -> unit
138(** Pretty-print DOCTYPE data. *)
139
140(** Quirks mode setting for the document.
141
142 {i Quirks mode} is a browser rendering mode that emulates bugs and
143 non-standard behaviors from older browsers (primarily Internet Explorer 5).
144 Modern HTML5 documents should always render in {i standards mode}
145 (no quirks) for consistent, predictable behavior.
146
147 The HTML5 parser determines quirks mode based on the DOCTYPE declaration:
148
149 - {b No_quirks} (Standards mode): The document renders according to modern
150 HTML5 and CSS specifications. This is triggered by [<!DOCTYPE html>].
151 CSS box model, table layout, and other features work as specified.
152
153 - {b Quirks} (Full quirks mode): The document renders with legacy browser
154 bugs emulated. This happens when:
155 {ul
156 {- DOCTYPE is missing entirely}
157 {- DOCTYPE has certain legacy public identifiers}
158 {- DOCTYPE has the wrong format}}
159
160 In quirks mode, many CSS properties behave differently:
161 {ul
162 {- Tables don't inherit font properties}
163 {- Box model uses non-standard width calculations}
164 {- Certain CSS selectors don't work correctly}}
165
166 - {b Limited_quirks} (Almost standards mode): A middle ground that applies
167 only a few specific quirks, primarily affecting table cell vertical
168 sizing. Triggered by XHTML DOCTYPEs and certain HTML4 DOCTYPEs.
169
170 {b Recommendation:} Always use [<!DOCTYPE html>] at the start of HTML5
171 documents to ensure {b No_quirks} mode.
172
173 @see <https://quirks.spec.whatwg.org/>
174 Quirks Mode Standard - detailed specification
175 @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
176 WHATWG: How the parser determines quirks mode
177*)
178type quirks_mode = No_quirks | Quirks | Limited_quirks
179
180val pp_quirks_mode : Format.formatter -> quirks_mode -> unit
181(** Pretty-print quirks mode. *)
182
183(** Source location where a node was parsed.
184
185 Location tracking enables error messages to point to specific lines
186 and columns in the source document where validation issues occur.
187*)
188type location = {
189 line : int; (** Line number (1-indexed) *)
190 column : int; (** Column number (1-indexed) *)
191 end_line : int option; (** End line for multi-line spans *)
192 end_column : int option; (** End column for multi-line spans *)
193}
194
195(** A DOM node in the parsed document tree.
196
197 All node types use the same record structure. The [name] field determines
198 the node type:
199 - Element: the tag name (e.g., "div", "p", "span")
200 - Text: "#text"
201 - Comment: "#comment"
202 - Document: "#document"
203 - Document fragment: "#document-fragment"
204 - Doctype: "!doctype"
205
206 {3 Understanding Node Fields}
207
208 Different node types use different combinations of fields:
209
210 {v
211 Node Type | name | namespace | attrs | data | template_content | doctype
212 ------------------|------------------|-----------|-------|------|------------------|--------
213 Element | tag name | Yes | Yes | No | If <template> | No
214 Text | "#text" | No | No | Yes | No | No
215 Comment | "#comment" | No | No | Yes | No | No
216 Document | "#document" | No | No | No | No | No
217 Document Fragment | "#document-fragment" | No | No | No | No | No
218 Doctype | "!doctype" | No | No | No | No | Yes
219 v}
220
221 {3 Element Tag Names}
222
223 For element nodes, the [name] field contains the lowercase tag name.
224 HTML5 defines many elements with specific meanings:
225
226 {b Structural elements:} [html], [head], [body], [header], [footer],
227 [main], [nav], [article], [section], [aside]
228
229 {b Text content:} [p], [div], [span], [h1]-[h6], [pre], [blockquote]
230
231 {b Lists:} [ul], [ol], [li], [dl], [dt], [dd]
232
233 {b Tables:} [table], [tr], [td], [th], [thead], [tbody], [tfoot]
234
235 {b Forms:} [form], [input], [button], [select], [textarea], [label]
236
237 {b Media:} [img], [audio], [video], [canvas], [svg]
238
239 @see <https://html.spec.whatwg.org/multipage/indices.html#elements-3>
240 WHATWG: Index of HTML elements
241
242 {3 Void Elements}
243
244 Some elements are {i void elements} - they cannot have children and have
245 no end tag. These include: [area], [base], [br], [col], [embed], [hr],
246 [img], [input], [link], [meta], [source], [track], [wbr].
247
248 @see <https://html.spec.whatwg.org/multipage/syntax.html#void-elements>
249 WHATWG: Void elements
250
251 {3 The Template Element}
252
253 The [<template>] element is special: its children are not rendered
254 directly but stored in a separate document fragment accessible via
255 the [template_content] field. Templates are used for client-side
256 templating where content is cloned and inserted via JavaScript.
257
258 @see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
259 WHATWG: The template element
260*)
261type node = {
262 mutable name : string;
263 (** Tag name for elements, or special name for other node types.
264
265 For elements, this is the lowercase tag name (e.g., "div", "span").
266 For other node types, use the constants {!document_name},
267 {!text_name}, {!comment_name}, etc. *)
268
269 mutable namespace : string option;
270 (** Element namespace: [None] for HTML, [Some "svg"], [Some "mathml"].
271
272 Most elements are in the HTML namespace ([None]). The SVG and MathML
273 namespaces are only used when content appears inside [<svg>] or
274 [<math>] elements respectively.
275
276 @see <https://html.spec.whatwg.org/multipage/dom.html#elements-in-the-dom>
277 WHATWG: Elements in the DOM *)
278
279 mutable attrs : (string * string) list;
280 (** Element attributes as (name, value) pairs.
281
282 Attributes provide additional information about elements. Common
283 global attributes include:
284 - [id]: Unique identifier for the element
285 - [class]: Space-separated list of CSS class names
286 - [style]: Inline CSS styles
287 - [title]: Advisory text (shown as tooltip)
288 - [lang]: Language of the element's content
289 - [hidden]: Whether the element should be hidden
290
291 Element-specific attributes include:
292 - [href] on [<a>]: The link destination URL
293 - [src] on [<img>]: The image source URL
294 - [type] on [<input>]: The input control type
295 - [disabled] on form controls: Whether the control is disabled
296
297 In HTML5, attribute names are case-insensitive and are normalized
298 to lowercase by the parser.
299
300 @see <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>
301 WHATWG: Global attributes
302 @see <https://html.spec.whatwg.org/multipage/indices.html#attributes-3>
303 WHATWG: Index of attributes *)
304
305 mutable children : node list;
306 (** Child nodes in document order.
307
308 For most elements, this list contains the nested elements and text.
309 For void elements (like [<br>], [<img>]), this is always empty.
310 For [<template>] elements, the actual content is in
311 [template_content], not here. *)
312
313 mutable parent : node option;
314 (** Parent node, or [None] for root and detached nodes.
315
316 A node attached to a tree has a parent; a Document node or a newly
317 created (not yet attached) node has none. This back-reference enables traversing up the tree. *)
318
319 mutable data : string;
320 (** Text content for text and comment nodes.
321
322 For text nodes, this contains the actual text. For comment nodes,
323 this contains the comment text (without the [<!--] and [-->]
324 delimiters). For other node types, this field is empty. *)
325
326 mutable template_content : node option;
327 (** Document fragment for [<template>] element contents.
328
329 The [<template>] element holds "inert" content that is not
330 rendered but can be cloned and inserted elsewhere. This field
331 contains a document fragment with the template's content.
332
333 For non-template elements, this is [None].
334
335 @see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
336 WHATWG: The template element *)
337
338 mutable doctype : doctype_data option;
339 (** DOCTYPE information for doctype nodes.
340
341 Only doctype nodes use this field; for all other nodes it is [None]. *)
342
343 mutable location : location option;
344 (** Source location where this node was parsed.
345
346 This field enables validation error messages to include line and column
347 numbers. It is [None] for nodes created programmatically rather than
348 by parsing. *)
349}
350
351val pp : Format.formatter -> node -> unit
352(** Pretty-print a DOM node. Prints a summary representation showing the
353 node type and key attributes. Does not recursively print children. *)
354
355(** {1 Node Name Constants}
356
357 These constants identify special node types. Compare with [node.name]
358 to determine the node type.
359*)
360
361val document_name : string
362(** ["#document"] - name for document nodes.
363
364 The Document node is the root of every HTML document tree. It represents
365 the entire document and is the parent of the [<html>] element.
366
367 @see <https://html.spec.whatwg.org/multipage/dom.html#document>
368 WHATWG: The Document object *)
369
370val document_fragment_name : string
371(** ["#document-fragment"] - name for document fragment nodes.
372
373 Document fragments are lightweight container nodes used to hold a
374 collection of nodes without a parent document. They are used:
375 - To hold [<template>] element contents
376 - As results of fragment parsing (innerHTML)
377 - For efficient batch DOM operations
378
379 @see <https://dom.spec.whatwg.org/#documentfragment>
380 DOM Standard: DocumentFragment *)
381
382val text_name : string
383(** ["#text"] - name for text nodes.
384
385 Text nodes contain the character data within elements. When the
386 parser encounters text between tags like [<p>Hello world</p>],
387 it creates a text node with data ["Hello world"] as a child of
388 the [<p>] element.
389
390 Adjacent text nodes are automatically merged by the parser. *)
391
392val comment_name : string
393(** ["#comment"] - name for comment nodes.
394
395 Comment nodes represent HTML comments: [<!-- comment text -->].
396 Comments are preserved in the DOM but not rendered to users.
397 They're useful for development notes or conditional content. *)
398
399val doctype_name : string
400(** ["!doctype"] - name for doctype nodes.
401
402 The DOCTYPE node represents the [<!DOCTYPE html>] declaration.
403 When present, it is normally the first child of the Document node (comments that precede it in the source may come first).
404
405 @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
406 WHATWG: The DOCTYPE *)
407
408(** {1 Constructors}
409
410 Functions to create new DOM nodes. All nodes start with no parent and
411 no children. Use {!append_child} or {!insert_before} to build a tree.
412*)
413
414val create_element : string -> ?namespace:string option ->
415 ?attrs:(string * string) list -> ?location:location -> unit -> node
416(** Create an element node.
417
418 Elements are the primary building blocks of HTML documents. Each
419 element represents a component of the document with semantic meaning.
420
421 @param name The tag name (e.g., "div", "p", "span"). Tag names are
422 case-insensitive in HTML; by convention, use lowercase.
423 @param namespace Element namespace:
424 - [None] (default): HTML namespace for standard elements
425 - [Some "svg"]: SVG namespace for graphics elements
426 - [Some "mathml"]: MathML namespace for mathematical notation
427 @param attrs Initial attributes as [(name, value)] pairs
428
429 {b Examples:}
430 {[
431 (* Simple HTML element *)
432 let div = create_element "div" ()
433
434 (* Element with attributes *)
435 let link = create_element "a"
436 ~attrs:[("href", "https://example.com"); ("class", "external")]
437 ()
438
439 (* SVG element *)
440 let rect = create_element "rect"
441 ~namespace:(Some "svg")
442 ~attrs:[("width", "100"); ("height", "50"); ("fill", "blue")]
443 ()
444 ]}
445
446 @see <https://html.spec.whatwg.org/multipage/dom.html#elements-in-the-dom>
447 WHATWG: Elements in the DOM
448*)
449
450val create_text : ?location:location -> string -> node
451(** Create a text node with the given content.
452
453 Text nodes contain the readable content of HTML documents. They
454 appear as children of elements and represent the characters that
455 users see.
456
457 {b Note:} Text content is stored as-is. Character references like
458 [&amp;] should already be decoded to their character values.
459
460 {b Example:}
461 {[
462 let text = create_text "Hello, world!" in
463 (* To put text in a paragraph: *)
464 let p = create_element "p" () in
465 append_child p text
466 ]}
467*)
468
469val create_comment : ?location:location -> string -> node
470(** Create a comment node with the given content.
471
472 Comments are human-readable notes in HTML that don't appear in
473 the rendered output. They're written as [<!-- comment -->] in HTML.
474
475 @param data The comment text (without the [<!--] and [-->] delimiters)
476
477 {b Example:}
478 {[
479 let comment = create_comment " TODO: Add navigation "
480 (* Represents: <!-- TODO: Add navigation --> *)
481 ]}
482
483 @see <https://html.spec.whatwg.org/multipage/syntax.html#comments>
484 WHATWG: HTML comments
485*)
486
487val create_document : unit -> node
488(** Create an empty document node.
489
490 The Document node is the root of an HTML document tree. It represents
491 the entire document and serves as the parent for the DOCTYPE (if any)
492 and the root [<html>] element.
493
494 In a complete HTML document, the structure is:
495 {v
496 #document
497 ├── !doctype
498 └── html
499     ├── head
500     └── body
501 v}
502
503 @see <https://html.spec.whatwg.org/multipage/dom.html#document>
504 WHATWG: The Document object
505*)
506
507val create_document_fragment : unit -> node
508(** Create an empty document fragment.
509
510 Document fragments are lightweight containers that can hold multiple
511 nodes without being part of the main document tree. They're useful for:
512
513 - {b Template contents:} The [<template>] element stores its children
514 in a document fragment, keeping them inert until cloned
515
516 - {b Fragment parsing:} When parsing HTML fragments (like innerHTML),
517 the result is placed in a document fragment
518
519 - {b Batch operations:} Build a subtree in a fragment, then insert it
520 into the document in one operation for better performance
521
522 @see <https://dom.spec.whatwg.org/#documentfragment>
523 DOM Standard: DocumentFragment
524*)
525
526val create_doctype : ?name:string -> ?public_id:string ->
527 ?system_id:string -> ?location:location -> unit -> node
528(** Create a DOCTYPE node.
529
530 The DOCTYPE declaration tells browsers to use standards mode for
531 rendering. For HTML5 documents, use:
532
533 {[
534 let doctype = create_doctype ~name:"html" ()
535 (* Represents: <!DOCTYPE html> *)
536 ]}
537
538 @param name DOCTYPE name (usually ["html"] for HTML documents)
539 @param public_id Public identifier (legacy, rarely needed)
540 @param system_id System identifier (legacy, rarely needed)
541
542 {b Legacy example:}
543 {[
544 (* HTML 4.01 Strict DOCTYPE - not recommended for new documents *)
545 let legacy = create_doctype
546 ~name:"HTML"
547 ~public_id:"-//W3C//DTD HTML 4.01//EN"
548 ~system_id:"http://www.w3.org/TR/html4/strict.dtd"
549 ()
550 ]}
551
552 @see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
553 WHATWG: The DOCTYPE
554*)
555
556val create_template : ?namespace:string option ->
557 ?attrs:(string * string) list -> ?location:location -> unit -> node
558(** Create a [<template>] element with its content document fragment.
559
560 The [<template>] element holds inert HTML content that is not
561 rendered directly. The content is stored in a separate document
562 fragment and can be:
563 - Cloned and inserted into the document via JavaScript
564 - Used as a stamping template for repeated content
565 - Pre-parsed without affecting the page
566
567 {b How templates work:}
568
569 Unlike normal elements, a [<template>]'s children are not rendered.
570 Instead, they're stored in the [template_content] field. This means:
571 - Images inside won't load
572 - Scripts inside won't execute
573 - The content is "inert" until explicitly activated
574
575 {b Example:}
576 {[
577 let template = create_template () in
578 let div = create_element "div" () in
579 let text = create_text "Template content" in
580 append_child div text;
581 (* Add to template's content fragment, not children *)
582 match template.template_content with
583 | Some fragment -> append_child fragment div
584 | None -> ()
585 ]}
586
587 @see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
588 WHATWG: The template element
589*)
590
591(** {1 Node Type Predicates}
592
593 Functions to test what type of node you have. Since all nodes use the
594 same record type, these predicates check the [name] field to determine
595 the actual node type.
596*)
597
598val is_element : node -> bool
599(** [is_element node] returns [true] if the node is an element node.
600
601 Elements are HTML tags like [<div>], [<p>], [<a>]. They are
602 identified by having a tag name that doesn't match any of the
603 special node name constants.
604*)
605
606val is_text : node -> bool
607(** [is_text node] returns [true] if the node is a text node.
608
609 Text nodes contain the character content within elements.
610 They have [name = "#text"]. *)
611
612val is_comment : node -> bool
613(** [is_comment node] returns [true] if the node is a comment node.
614
615 Comment nodes represent HTML comments [<!-- ... -->].
616 They have [name = "#comment"]. *)
617
618val is_document : node -> bool
619(** [is_document node] returns [true] if the node is a document node.
620
621 The document node is the root of the DOM tree.
622 It has [name = "#document"]. *)
623
624val is_document_fragment : node -> bool
625(** [is_document_fragment node] returns [true] if the node is a document fragment.
626
627 Document fragments are lightweight containers.
628 They have [name = "#document-fragment"]. *)
629
630val is_doctype : node -> bool
631(** [is_doctype node] returns [true] if the node is a DOCTYPE node.
632
633 DOCTYPE nodes represent the [<!DOCTYPE>] declaration.
634 They have [name = "!doctype"]. *)
635
636val has_children : node -> bool
637(** [has_children node] returns [true] if the node has any children.
638
639 Note: For [<template>] elements, this checks the direct children list,
640 not the template content fragment. *)
641
642(** {1 Tree Manipulation}
643
644 Functions to modify the DOM tree structure. These functions automatically
645 maintain parent/child references, ensuring the tree remains consistent.
646*)
647
648val append_child : node -> node -> unit
649(** [append_child parent child] adds [child] as the last child of [parent].
650
651 The child's parent reference is updated to point to [parent].
652 If the child already has a parent, it is first removed from that parent.
653
654 {b Example:}
655 {[
656 let body = create_element "body" () in
657 let p = create_element "p" () in
658 let text = create_text "Hello!" in
659 append_child p text;
660 append_child body p
661 (* Result:
662 body
663 └── p
664     └── #text "Hello!"
665 *)
666 ]}
667*)
668
669val insert_before : node -> node -> node -> unit
670(** [insert_before parent new_child ref_child] inserts [new_child] before
671 [ref_child] in [parent]'s children.
672
673 @param parent The parent node
674 @param new_child The node to insert
675 @param ref_child The existing child to insert before
676
677 @raise Not_found if [ref_child] is not a child of [parent].
678
679 {b Example:}
680 {[
681 let ul = create_element "ul" () in
682 let li1 = create_element "li" () in
683 let li3 = create_element "li" () in
684 append_child ul li1;
685 append_child ul li3;
686 let li2 = create_element "li" () in
687 insert_before ul li2 li3
688 (* Result: ul contains li1, li2, li3 in that order *)
689 ]}
690*)
691
692val remove_child : node -> node -> unit
693(** [remove_child parent child] removes [child] from [parent]'s children.
694
695 The child's parent reference is set to [None].
696
697 @raise Not_found if [child] is not a child of [parent].
698*)
699
700val insert_text_at : node -> string -> node option -> unit
701(** [insert_text_at parent text before_node] inserts text content.
702
703 If [before_node] is [None], appends at the end. If the previous sibling
704 is a text node, the text is merged into it (text nodes are coalesced).
705 Otherwise, a new text node is created.
706
707 This implements the HTML5 parser's text insertion algorithm which
708 ensures adjacent text nodes are always merged, matching browser behavior.
709
710 @see <https://html.spec.whatwg.org/multipage/parsing.html#appropriate-place-for-inserting-a-node>
711 WHATWG: Appropriate place for inserting a node
712*)
713
714(** {1 Attribute Operations}
715
716 Functions to read and modify element attributes. Attributes are
717 name-value pairs that provide additional information about elements.
718
719 In HTML5, attribute names are case-insensitive and normalized to
720 lowercase by the parser.
721
722 @see <https://html.spec.whatwg.org/multipage/dom.html#attributes>
723 WHATWG: Attributes
724*)
725
726val get_attr : node -> string -> string option
727(** [get_attr node name] returns the value of attribute [name], or [None]
728 if the attribute doesn't exist.
729
730 Attribute lookup is case-sensitive on the stored (lowercase) names.
731*)
732
733val set_attr : node -> string -> string -> unit
734(** [set_attr node name value] sets attribute [name] to [value].
735
736 If the attribute already exists, it is replaced.
737 If it doesn't exist, it is added.
738*)
739
740val has_attr : node -> string -> bool
741(** [has_attr node name] returns [true] if the node has attribute [name]. *)
742
743(** {1 Space-Separated Attribute Values}
744
745 Many HTML attributes contain space-separated lists of values. For example,
746 the [class] attribute contains CSS class names: [class="header main active"].
747 These functions parse such attributes into OCaml lists.
748
749 Per the HTML5 spec, "ASCII whitespace" (space, tab, newline, carriage return,
750 form feed) is used as the separator.
751*)
752
753val split_on_whitespace : string -> string list
754(** [split_on_whitespace s] splits a string on ASCII whitespace.
755
756 This implements the HTML5 "split on ASCII whitespace" algorithm used
757 for parsing space-separated attribute values.
758
759 {b Example:}
760 {[
761 split_on_whitespace "foo bar\tbaz"
762 (* Returns: ["foo"; "bar"; "baz"] *)
763 ]}
764*)
765
766val get_attr_list : node -> string -> string list
767(** [get_attr_list node name] returns a space-separated attribute as a list.
768
769 Returns an empty list if the attribute doesn't exist.
770
771 {b Example:}
772 {[
773 (* For <div class="foo bar baz"> *)
774 get_attr_list div "class"
775 (* Returns: ["foo"; "bar"; "baz"] *)
776 ]}
777*)
778
779val get_class_list : node -> string list
780(** [get_class_list node] returns the class attribute as a list of class names.
781
782 Equivalent to [get_attr_list node "class"].
783
784 {b Example:}
785 {[
786 (* For <div class="container main"> *)
787 get_class_list div
788 (* Returns: ["container"; "main"] *)
789 ]}
790*)
791
792val get_rel_list : node -> string list
793(** [get_rel_list node] returns the rel attribute as a list of link types.
794
795 Link types are lowercased since they are case-insensitive per HTML5 spec.
796
797 {b Example:}
798 {[
799 (* For <link rel="stylesheet preload"> *)
800 get_rel_list link
801 (* Returns: ["stylesheet"; "preload"] *)
802 ]}
803*)
804
805val get_headers_list : node -> string list
806(** [get_headers_list node] returns the headers attribute as a list of IDs.
807
808 Used on [<td>] and [<th>] elements to associate cells with headers.
809*)
810
811val get_itemref_list : node -> string list
812(** [get_itemref_list node] returns the itemref attribute as a list of IDs.
813
814 Used for microdata to reference elements by ID.
815*)
816
817val get_itemprop_list : node -> string list
818(** [get_itemprop_list node] returns the itemprop attribute as a list.
819
820 Used for microdata property names.
821*)
822
823val get_itemtype_list : node -> string list
824(** [get_itemtype_list node] returns the itemtype attribute as a list of URLs.
825
826 Used for microdata type URLs.
827*)
828
829(** {1 Location Helpers}
830
831 Functions to manage source location information for nodes.
832*)
833
834val make_location : line:int -> column:int -> ?end_line:int -> ?end_column:int ->
835 unit -> location
836(** [make_location ~line ~column ()] creates a source location record.
837
838 @param line Start line number (1-indexed)
839 @param column Start column number (1-indexed)
840 @param end_line Optional end line for multi-line spans
841 @param end_column Optional end column for multi-line spans
842*)
843
844val set_location : node -> line:int -> column:int -> ?end_line:int ->
845 ?end_column:int -> unit -> unit
846(** [set_location node ~line ~column ()] sets the source location of a node. *)
847
848val get_location : node -> location option
849(** [get_location node] returns the source location if set, or [None]. *)
850
851(** {1 Tree Traversal}
852
853 Functions to navigate the DOM tree.
854*)
855
856val descendants : node -> node list
857(** [descendants node] returns all descendant nodes in document order.
858
859 This performs a depth-first traversal, returning children before
860 siblings at each level. The node itself is not included.
861
862 {b Document order} is the order nodes appear in the HTML source:
863 parent before children, earlier siblings before later ones.
864
865 {b Example:}
866 {[
867 (* For tree: div > (p > "hello", span > "world") *)
868 descendants div
869 (* Returns: [p; text("hello"); span; text("world")] *)
870 ]}
871*)
872
873val ancestors : node -> node list
874(** [ancestors node] returns all ancestor nodes from parent to root.
875
876 The first element is the immediate parent, the last is the root
877 (usually the Document node).
878
879 {b Example:}
880 {[
881 (* For a text node inside: html > body > p > text *)
882 ancestors text_node
883 (* Returns: [p; body; html; #document] *)
884 ]}
885*)
886
887val get_text_content : node -> string
888(** [get_text_content node] returns the concatenated text content.
889
890 For text nodes, returns the text data directly.
891 For elements, recursively concatenates all descendant text content.
892 For other node types, returns an empty string.
893
894 {b Example:}
895 {[
896 (* For: <p>Hello <b>world</b>!</p> *)
897 get_text_content p_element
898 (* Returns: "Hello world!" *)
899 ]}
900*)
901
902(** {1 Cloning} *)
903
904val clone : ?deep:bool -> node -> node
905(** [clone ?deep node] creates a copy of the node.
906
907 @param deep If [true], recursively clone all descendants (default: [false])
908
909 The cloned node has no parent. With [~deep:false], only the node itself
910 is copied (with its attributes, but not its children).
911
912 {b Example:}
913 {[
914 let original = create_element "div" ~attrs:[("class", "box")] ()
915 let shallow = clone original
916 let deep = clone ~deep:true original
917 ]}
918*)