+1
dune
+1
dune
···
···
1
+
(data_only_dirs third_party)
+837
lib/dom/html5rw_dom.mli
+837
lib/dom/html5rw_dom.mli
···
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** HTML5 DOM Types and Operations
7
+
8
+
This module provides the DOM (Document Object Model) node representation
9
+
used by the HTML5 parser. The DOM is a programming interface that
10
+
represents an HTML document as a tree of nodes, where each node represents
11
+
part of the document (an element, text content, comment, etc.).
12
+
13
+
{2 What is the DOM?}
14
+
15
+
When an HTML parser processes markup like [<p>Hello <b>world</b></p>], it
16
+
doesn't store the text directly. Instead, it builds a tree structure in
17
+
memory:
18
+
19
+
{v
20
+
Document
21
+
└── html
22
+
└── body
23
+
└── p
24
+
├── #text "Hello "
25
+
└── b
26
+
└── #text "world"
27
+
v}
28
+
29
+
This tree is the DOM. Each entry in the tree is a {i node}. Programs can
30
+
traverse and modify this tree to read or change the document.
31
+
32
+
@see <https://html.spec.whatwg.org/multipage/dom.html>
33
+
WHATWG: Semantics, structure, and APIs of HTML documents
34
+
35
+
{2 Node Types}
36
+
37
+
The HTML5 DOM includes several node types, all represented by the same
38
+
record type with different field usage:
39
+
40
+
- {b Element nodes}: HTML elements like [<div>], [<p>], [<a href="...">].
41
+
Elements are the building blocks of HTML documents. They can have
42
+
attributes and contain other nodes.
43
+
44
+
- {b Text nodes}: The actual text content within elements. For example,
45
+
in [<p>Hello</p>], "Hello" is a text node that is a child of the [<p>]
46
+
element.
47
+
48
+
- {b Comment nodes}: HTML comments written as [<!-- comment text -->].
49
+
Comments are preserved in the DOM but not rendered.
50
+
51
+
- {b Document nodes}: The root of the entire document tree. Every HTML
52
+
document has exactly one Document node at the top.
53
+
54
+
- {b Document fragment nodes}: Lightweight containers that hold a
55
+
collection of nodes without a parent. Used for efficient batch DOM
56
+
operations and [<template>] element contents.
57
+
58
+
- {b Doctype nodes}: The [<!DOCTYPE html>] declaration at the start of
59
+
HTML5 documents. This declaration tells browsers to render the page
60
+
in standards mode.
61
+
62
+
@see <https://html.spec.whatwg.org/multipage/dom.html#kinds-of-content>
63
+
WHATWG: Kinds of content
64
+
65
+
{2 Namespaces}
66
+
67
+
HTML5 can embed content from other XML vocabularies. Elements belong to
68
+
one of three {i namespaces}:
69
+
70
+
- {b HTML namespace} ([None] or implicit): Standard HTML elements like
71
+
[<div>], [<p>], [<table>]. This is the default for all elements.
72
+
73
+
- {b SVG namespace} ([Some "svg"]): Scalable Vector Graphics for drawing.
74
+
When the parser encounters an [<svg>] tag, all elements inside it
75
+
(like [<rect>], [<circle>], [<path>]) are placed in the SVG namespace.
76
+
77
+
- {b MathML namespace} ([Some "mathml"]): Mathematical Markup Language
78
+
for equations. When the parser encounters a [<math>] tag, elements
79
+
inside it are placed in the MathML namespace.
80
+
81
+
The parser automatically switches namespaces when entering and leaving
82
+
these foreign content islands.
83
+
84
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign>
85
+
WHATWG: Parsing foreign content
86
+
87
+
{2 Tree Structure}
88
+
89
+
Nodes form a bidirectional tree: each node has a list of children and
90
+
an optional parent reference. Modification functions in this module
91
+
maintain these references automatically.
92
+
93
+
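As long as the tree is modified only through the functions in this module,
it stays well-formed: a node can only have one parent, and
94
+
circular references are not possible. The record fields are mutable, so
writing to them directly bypasses this bookkeeping.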
95
+
*)
96
+
97
+
(** {1 Types} *)
98
+
99
+
(** Information associated with a DOCTYPE node.
100
+
101
+
The {i document type declaration} (DOCTYPE) tells browsers what version
102
+
of HTML the document uses. In HTML5, the standard declaration is simply:
103
+
104
+
{v <!DOCTYPE html> v}
105
+
106
+
This minimal DOCTYPE triggers {i standards mode} (no quirks). The DOCTYPE
107
+
can optionally include a public identifier and system identifier for
108
+
legacy compatibility with SGML-based tools, but these are rarely used
109
+
in modern HTML5 documents.
110
+
111
+
{b Historical context:} In HTML4 and XHTML, DOCTYPEs were verbose and
112
+
referenced DTD files. For example:
113
+
{v <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
114
+
"http://www.w3.org/TR/html4/strict.dtd"> v}
115
+
116
+
HTML5 simplified this to just [<!DOCTYPE html>] because:
117
+
- Browsers never actually fetched or validated against DTDs
118
+
- The DOCTYPE's only real purpose is triggering standards mode
119
+
- A minimal DOCTYPE achieves this goal
120
+
121
+
{b Field meanings:}
122
+
- [name]: The document type name, almost always ["html"] for HTML documents
123
+
- [public_id]: A public identifier (legacy); [None] for HTML5
124
+
- [system_id]: A system identifier/URL (legacy); [None] for HTML5
125
+
126
+
@see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
127
+
WHATWG: The DOCTYPE
128
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
129
+
WHATWG: DOCTYPE handling during parsing
130
+
*)
131
+
type doctype_data = Node.doctype_data = {
132
+
name : string option; (** The DOCTYPE name, e.g., "html" *)
133
+
public_id : string option; (** Public identifier (legacy, rarely used) *)
134
+
system_id : string option; (** System identifier (legacy, rarely used) *)
135
+
}
136
+
137
+
(** Quirks mode setting for the document.
138
+
139
+
{i Quirks mode} is a browser rendering mode that emulates bugs and
140
+
non-standard behaviors from older browsers (primarily Internet Explorer 5).
141
+
Modern HTML5 documents should always render in {i standards mode}
142
+
(no quirks) for consistent, predictable behavior.
143
+
144
+
The HTML5 parser determines quirks mode based on the DOCTYPE declaration:
145
+
146
+
- {b No_quirks} (Standards mode): The document renders according to modern
147
+
HTML5 and CSS specifications. This is triggered by [<!DOCTYPE html>].
148
+
CSS box model, table layout, and other features work as specified.
149
+
150
+
- {b Quirks} (Full quirks mode): The document renders with legacy browser
151
+
bugs emulated. This happens when:
152
+
{ul
153
+
{- DOCTYPE is missing entirely}
154
+
{- DOCTYPE has certain legacy public identifiers}
155
+
{- DOCTYPE has the wrong format}}
156
+
157
+
In quirks mode, many CSS properties behave differently:
158
+
{ul
159
+
{- Tables don't inherit font properties}
160
+
{- Box model uses non-standard width calculations}
161
+
{- Certain CSS selectors don't work correctly}}
162
+
163
+
- {b Limited_quirks} (Almost standards mode): A middle ground that applies
164
+
only a few specific quirks, primarily affecting table cell vertical
165
+
sizing. Triggered by XHTML DOCTYPEs and certain HTML4 DOCTYPEs.
166
+
167
+
{b Recommendation:} Always use [<!DOCTYPE html>] at the start of HTML5
168
+
documents to ensure {b No_quirks} mode.
169
+
170
+
@see <https://quirks.spec.whatwg.org/>
171
+
Quirks Mode Standard - detailed specification
172
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
173
+
WHATWG: How the parser determines quirks mode
174
+
*)
175
+
type quirks_mode = Node.quirks_mode = No_quirks | Quirks | Limited_quirks
176
+
177
+
(** A DOM node in the parsed document tree.
178
+
179
+
All node types use the same record structure. The [name] field determines
180
+
the node type:
181
+
- Element: the tag name (e.g., "div", "p", "span")
182
+
- Text: "#text"
183
+
- Comment: "#comment"
184
+
- Document: "#document"
185
+
- Document fragment: "#document-fragment"
186
+
- Doctype: "!doctype"
187
+
188
+
{3 Understanding Node Fields}
189
+
190
+
Different node types use different combinations of fields:
191
+
192
+
{v
193
+
Node Type | name | namespace | attrs | data | template_content | doctype
194
+
------------------|------------------|-----------|-------|------|------------------|--------
195
+
Element | tag name | Yes | Yes | No | If <template> | No
196
+
Text | "#text" | No | No | Yes | No | No
197
+
Comment | "#comment" | No | No | Yes | No | No
198
+
Document | "#document" | No | No | No | No | No
199
+
Document Fragment | "#document-fragment" | No | No | No | No | No
200
+
Doctype | "!doctype" | No | No | No | No | Yes
201
+
v}
202
+
203
+
{3 Element Tag Names}
204
+
205
+
For element nodes, the [name] field contains the lowercase tag name.
206
+
HTML5 defines many elements with specific meanings:
207
+
208
+
{b Structural elements:} [html], [head], [body], [header], [footer],
209
+
[main], [nav], [article], [section], [aside]
210
+
211
+
{b Text content:} [p], [div], [span], [h1]-[h6], [pre], [blockquote]
212
+
213
+
{b Lists:} [ul], [ol], [li], [dl], [dt], [dd]
214
+
215
+
{b Tables:} [table], [tr], [td], [th], [thead], [tbody], [tfoot]
216
+
217
+
{b Forms:} [form], [input], [button], [select], [textarea], [label]
218
+
219
+
{b Media:} [img], [audio], [video], [canvas], [svg]
220
+
221
+
@see <https://html.spec.whatwg.org/multipage/indices.html#elements-3>
222
+
WHATWG: Index of HTML elements
223
+
224
+
{3 Void Elements}
225
+
226
+
Some elements are {i void elements} - they cannot have children and have
227
+
no end tag. These include: [area], [base], [br], [col], [embed], [hr],
228
+
[img], [input], [link], [meta], [source], [track], [wbr].
229
+
230
+
@see <https://html.spec.whatwg.org/multipage/syntax.html#void-elements>
231
+
WHATWG: Void elements
232
+
233
+
{3 The Template Element}
234
+
235
+
The [<template>] element is special: its children are not rendered
236
+
directly but stored in a separate document fragment accessible via
237
+
the [template_content] field. Templates are used for client-side
238
+
templating where content is cloned and inserted via JavaScript.
239
+
240
+
@see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
241
+
WHATWG: The template element
242
+
*)
243
+
type node = Node.node = {
244
+
mutable name : string;
245
+
(** Tag name for elements, or special name for other node types.
246
+
247
+
For elements, this is the lowercase tag name (e.g., "div", "span").
248
+
For other node types, use the constants {!document_name},
249
+
{!text_name}, {!comment_name}, etc. *)
250
+
251
+
mutable namespace : string option;
252
+
(** Element namespace: [None] for HTML, [Some "svg"], [Some "mathml"].
253
+
254
+
Most elements are in the HTML namespace ([None]). The SVG and MathML
255
+
namespaces are only used when content appears inside [<svg>] or
256
+
[<math>] elements respectively.
257
+
258
+
@see <https://html.spec.whatwg.org/multipage/dom.html#elements-in-the-dom>
259
+
WHATWG: Elements in the DOM *)
260
+
261
+
mutable attrs : (string * string) list;
262
+
(** Element attributes as (name, value) pairs.
263
+
264
+
Attributes provide additional information about elements. Common
265
+
global attributes include:
266
+
- [id]: Unique identifier for the element
267
+
- [class]: Space-separated list of CSS class names
268
+
- [style]: Inline CSS styles
269
+
- [title]: Advisory text (shown as tooltip)
270
+
- [lang]: Language of the element's content
271
+
- [hidden]: Whether the element should be hidden
272
+
273
+
Element-specific attributes include:
274
+
- [href] on [<a>]: The link destination URL
275
+
- [src] on [<img>]: The image source URL
276
+
- [type] on [<input>]: The input control type
277
+
- [disabled] on form controls: Whether the control is disabled
278
+
279
+
In HTML5, attribute names are case-insensitive and are normalized
280
+
to lowercase by the parser.
281
+
282
+
@see <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>
283
+
WHATWG: Global attributes
284
+
@see <https://html.spec.whatwg.org/multipage/indices.html#attributes-3>
285
+
WHATWG: Index of attributes *)
286
+
287
+
mutable children : node list;
288
+
(** Child nodes in document order.
289
+
290
+
For most elements, this list contains the nested elements and text.
291
+
For void elements (like [<br>], [<img>]), this is always empty.
292
+
For [<template>] elements, the actual content is in
293
+
[template_content], not here. *)
294
+
295
+
mutable parent : node option;
296
+
(** Parent node, [None] for root nodes.
297
+
298
+
A node has a parent once it is attached to a tree; freshly created,
removed, and cloned nodes have [None]. This back-reference
299
+
enables traversing up the tree. *)
300
+
301
+
mutable data : string;
302
+
(** Text content for text and comment nodes.
303
+
304
+
For text nodes, this contains the actual text. For comment nodes,
305
+
this contains the comment text (without the [<!--] and [-->]
306
+
delimiters). For other node types, this field is empty. *)
307
+
308
+
mutable template_content : node option;
309
+
(** Document fragment for [<template>] element contents.
310
+
311
+
The [<template>] element holds "inert" content that is not
312
+
rendered but can be cloned and inserted elsewhere. This field
313
+
contains a document fragment with the template's content.
314
+
315
+
For non-template elements, this is [None].
316
+
317
+
@see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
318
+
WHATWG: The template element *)
319
+
320
+
mutable doctype : doctype_data option;
321
+
(** DOCTYPE information for doctype nodes.
322
+
323
+
Only doctype nodes use this field; for all other nodes it is [None]. *)
324
+
}
325
+
326
+
(** {1 Node Name Constants}
327
+
328
+
These constants identify special node types. Compare with [node.name]
329
+
to determine the node type.
330
+
*)
331
+
332
+
val document_name : string
333
+
(** ["#document"] - name for document nodes.
334
+
335
+
The Document node is the root of every HTML document tree. It represents
336
+
the entire document and is the parent of the [<html>] element.
337
+
338
+
@see <https://html.spec.whatwg.org/multipage/dom.html#document>
339
+
WHATWG: The Document object *)
340
+
341
+
val document_fragment_name : string
342
+
(** ["#document-fragment"] - name for document fragment nodes.
343
+
344
+
Document fragments are lightweight container nodes used to hold a
345
+
collection of nodes without a parent document. They are used:
346
+
- To hold [<template>] element contents
347
+
- As results of fragment parsing (innerHTML)
348
+
- For efficient batch DOM operations
349
+
350
+
@see <https://dom.spec.whatwg.org/#documentfragment>
351
+
DOM Standard: DocumentFragment *)
352
+
353
+
val text_name : string
354
+
(** ["#text"] - name for text nodes.
355
+
356
+
Text nodes contain the character data within elements. When the
357
+
parser encounters text between tags like [<p>Hello world</p>],
358
+
it creates a text node with data ["Hello world"] as a child of
359
+
the [<p>] element.
360
+
361
+
Adjacent text nodes are automatically merged by the parser. *)
362
+
363
+
val comment_name : string
364
+
(** ["#comment"] - name for comment nodes.
365
+
366
+
Comment nodes represent HTML comments: [<!-- comment text -->].
367
+
Comments are preserved in the DOM but not rendered to users.
368
+
They're useful for development notes or conditional content. *)
369
+
370
+
val doctype_name : string
371
+
(** ["!doctype"] - name for doctype nodes.
372
+
373
+
The DOCTYPE node represents the [<!DOCTYPE html>] declaration.
374
+
When present, it normally precedes the [<html>] element among the Document
node's children; comments that appear before the DOCTYPE in the source
come before it.
375
+
376
+
@see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
377
+
WHATWG: The DOCTYPE *)
378
+
379
+
(** {1 Constructors}
380
+
381
+
Functions to create new DOM nodes. All nodes start with no parent and
382
+
no children. Use {!append_child} or {!insert_before} to build a tree.
383
+
*)
384
+
385
+
val create_element :
386
+
string ->
387
+
?namespace:string option ->
388
+
?attrs:(string * string) list ->
389
+
unit ->
390
+
node
391
+
(** Create an element node.
392
+
393
+
Elements are the primary building blocks of HTML documents. Each
394
+
element represents a component of the document with semantic meaning.
395
+
396
+
@param name The tag name (e.g., "div", "p", "span"). Tag names are
397
+
case-insensitive in HTML; by convention, use lowercase.
398
+
@param namespace Element namespace:
399
+
- [None] (default): HTML namespace for standard elements
400
+
- [Some "svg"]: SVG namespace for graphics elements
401
+
- [Some "mathml"]: MathML namespace for mathematical notation
402
+
@param attrs Initial attributes as [(name, value)] pairs
403
+
404
+
{b Examples:}
405
+
{[
406
+
(* Simple HTML element *)
407
+
let div = create_element "div" ()
408
+
409
+
(* Element with attributes *)
410
+
let link = create_element "a"
411
+
~attrs:[("href", "https://example.com"); ("class", "external")]
412
+
()
413
+
414
+
(* SVG element *)
415
+
let rect = create_element "rect"
416
+
~namespace:(Some "svg")
417
+
~attrs:[("width", "100"); ("height", "50"); ("fill", "blue")]
418
+
()
419
+
]}
420
+
421
+
@see <https://html.spec.whatwg.org/multipage/dom.html#elements-in-the-dom>
422
+
WHATWG: Elements in the DOM
423
+
*)
424
+
425
+
val create_text : string -> node
426
+
(** Create a text node with the given content.
427
+
428
+
Text nodes contain the readable content of HTML documents. They
429
+
appear as children of elements and represent the characters that
430
+
users see.
431
+
432
+
{b Note:} Text content is stored as-is. Character references like
433
+
[&amp;] should already be decoded to their character values.
434
+
435
+
{b Example:}
436
+
{[
437
+
let text = create_text "Hello, world!" in
438
+
(* To put text in a paragraph: *)
439
+
let p = create_element "p" () in
440
+
append_child p text
441
+
]}
442
+
*)
443
+
444
+
val create_comment : string -> node
445
+
(** Create a comment node with the given content.
446
+
447
+
Comments are human-readable notes in HTML that don't appear in
448
+
the rendered output. They're written as [<!-- comment -->] in HTML.
449
+
450
+
@param data The comment text (without the [<!--] and [-->] delimiters)
451
+
452
+
{b Example:}
453
+
{[
454
+
let comment = create_comment " TODO: Add navigation "
455
+
(* Represents: <!-- TODO: Add navigation --> *)
456
+
]}
457
+
458
+
@see <https://html.spec.whatwg.org/multipage/syntax.html#comments>
459
+
WHATWG: HTML comments
460
+
*)
461
+
462
+
val create_document : unit -> node
463
+
(** Create an empty document node.
464
+
465
+
The Document node is the root of an HTML document tree. It represents
466
+
the entire document and serves as the parent for the DOCTYPE (if any)
467
+
and the root [<html>] element.
468
+
469
+
In a complete HTML document, the structure is:
470
+
{v
471
+
#document
472
+
├── !doctype
473
+
└── html
474
+
├── head
475
+
└── body
476
+
v}
477
+
478
+
@see <https://html.spec.whatwg.org/multipage/dom.html#document>
479
+
WHATWG: The Document object
480
+
*)
481
+
482
+
val create_document_fragment : unit -> node
483
+
(** Create an empty document fragment.
484
+
485
+
Document fragments are lightweight containers that can hold multiple
486
+
nodes without being part of the main document tree. They're useful for:
487
+
488
+
- {b Template contents:} The [<template>] element stores its children
489
+
in a document fragment, keeping them inert until cloned
490
+
491
+
- {b Fragment parsing:} When parsing HTML fragments (like innerHTML),
492
+
the result is placed in a document fragment
493
+
494
+
- {b Batch operations:} Build a subtree in a fragment, then insert it
495
+
into the document in one operation for better performance
496
+
497
+
@see <https://dom.spec.whatwg.org/#documentfragment>
498
+
DOM Standard: DocumentFragment
499
+
*)
500
+
501
+
val create_doctype :
502
+
?name:string -> ?public_id:string -> ?system_id:string -> unit -> node
503
+
(** Create a DOCTYPE node.
504
+
505
+
The DOCTYPE declaration tells browsers to use standards mode for
506
+
rendering. For HTML5 documents, use:
507
+
508
+
{[
509
+
let doctype = create_doctype ~name:"html" ()
510
+
(* Represents: <!DOCTYPE html> *)
511
+
]}
512
+
513
+
@param name DOCTYPE name (usually ["html"] for HTML documents)
514
+
@param public_id Public identifier (legacy, rarely needed)
515
+
@param system_id System identifier (legacy, rarely needed)
516
+
517
+
{b Legacy example:}
518
+
{[
519
+
(* HTML 4.01 Strict DOCTYPE - not recommended for new documents *)
520
+
let legacy = create_doctype
521
+
~name:"HTML"
522
+
~public_id:"-//W3C//DTD HTML 4.01//EN"
523
+
~system_id:"http://www.w3.org/TR/html4/strict.dtd"
524
+
()
525
+
]}
526
+
527
+
@see <https://html.spec.whatwg.org/multipage/syntax.html#the-doctype>
528
+
WHATWG: The DOCTYPE
529
+
*)
530
+
531
+
val create_template :
532
+
?namespace:string option -> ?attrs:(string * string) list -> unit -> node
533
+
(** Create a [<template>] element with its content document fragment.
534
+
535
+
The [<template>] element holds inert HTML content that is not
536
+
rendered directly. The content is stored in a separate document
537
+
fragment and can be:
538
+
- Cloned and inserted into the document via JavaScript
539
+
- Used as a stamping template for repeated content
540
+
- Pre-parsed without affecting the page
541
+
542
+
{b How templates work:}
543
+
544
+
Unlike normal elements, a [<template>]'s children are not rendered.
545
+
Instead, they're stored in the [template_content] field. This means:
546
+
- Images inside won't load
547
+
- Scripts inside won't execute
548
+
- The content is "inert" until explicitly activated
549
+
550
+
{b Example:}
551
+
{[
552
+
let template = create_template () in
553
+
let div = create_element "div" () in
554
+
let text = create_text "Template content" in
555
+
append_child div text;
556
+
(* Add to template's content fragment, not children *)
557
+
match template.template_content with
558
+
| Some fragment -> append_child fragment div
559
+
| None -> ()
560
+
]}
561
+
562
+
@see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
563
+
WHATWG: The template element
564
+
*)
565
+
566
+
(** {1 Node Type Predicates}
567
+
568
+
Functions to test what type of node you have. Since all nodes use the
569
+
same record type, these predicates check the [name] field to determine
570
+
the actual node type.
571
+
*)
572
+
573
+
val is_element : node -> bool
574
+
(** [is_element node] returns [true] if the node is an element node.
575
+
576
+
Elements are HTML tags like [<div>], [<p>], [<a>]. They are
577
+
identified by having a tag name that doesn't match any of the
578
+
special node name constants.
579
+
*)
580
+
581
+
val is_text : node -> bool
582
+
(** [is_text node] returns [true] if the node is a text node.
583
+
584
+
Text nodes contain the character content within elements.
585
+
They have [name = "#text"]. *)
586
+
587
+
val is_comment : node -> bool
588
+
(** [is_comment node] returns [true] if the node is a comment node.
589
+
590
+
Comment nodes represent HTML comments [<!-- ... -->].
591
+
They have [name = "#comment"]. *)
592
+
593
+
val is_document : node -> bool
594
+
(** [is_document node] returns [true] if the node is a document node.
595
+
596
+
The document node is the root of the DOM tree.
597
+
It has [name = "#document"]. *)
598
+
599
+
val is_document_fragment : node -> bool
600
+
(** [is_document_fragment node] returns [true] if the node is a document fragment.
601
+
602
+
Document fragments are lightweight containers.
603
+
They have [name = "#document-fragment"]. *)
604
+
605
+
val is_doctype : node -> bool
606
+
(** [is_doctype node] returns [true] if the node is a DOCTYPE node.
607
+
608
+
DOCTYPE nodes represent the [<!DOCTYPE>] declaration.
609
+
They have [name = "!doctype"]. *)
610
+
611
+
val has_children : node -> bool
612
+
(** [has_children node] returns [true] if the node has any children.
613
+
614
+
Note: For [<template>] elements, this checks the direct children list,
615
+
not the template content fragment. *)
616
+
617
+
(** {1 Tree Manipulation}
618
+
619
+
Functions to modify the DOM tree structure. These functions automatically
620
+
maintain parent/child references, ensuring the tree remains consistent.
621
+
*)
622
+
623
+
val append_child : node -> node -> unit
624
+
(** [append_child parent child] adds [child] as the last child of [parent].
625
+
626
+
The child's parent reference is updated to point to [parent].
627
+
If the child already has a parent, it is first removed from that parent.
628
+
629
+
{b Example:}
630
+
{[
631
+
let body = create_element "body" () in
632
+
let p = create_element "p" () in
633
+
let text = create_text "Hello!" in
634
+
append_child p text;
635
+
append_child body p
636
+
(* Result:
637
+
body
638
+
└── p
639
+
└── #text "Hello!"
640
+
*)
641
+
]}
642
+
*)
643
+
644
+
val insert_before : node -> node -> node -> unit
645
+
(** [insert_before parent new_child ref_child] inserts [new_child] before
646
+
[ref_child] in [parent]'s children.
647
+
648
+
@param parent The parent node
649
+
@param new_child The node to insert
650
+
@param ref_child The existing child to insert before
651
+
652
+
@raise Not_found if [ref_child] is not a child of [parent].
653
+
654
+
{b Example:}
655
+
{[
656
+
let ul = create_element "ul" () in
657
+
let li1 = create_element "li" () in
658
+
let li3 = create_element "li" () in
659
+
append_child ul li1;
660
+
append_child ul li3;
661
+
let li2 = create_element "li" () in
662
+
insert_before ul li2 li3
663
+
(* Result: ul contains li1, li2, li3 in that order *)
664
+
]}
665
+
*)
666
+
667
+
val remove_child : node -> node -> unit
668
+
(** [remove_child parent child] removes [child] from [parent]'s children.
669
+
670
+
The child's parent reference is set to [None].
671
+
672
+
@raise Not_found if [child] is not a child of [parent].
673
+
*)
674
+
675
+
val insert_text_at : node -> string -> node option -> unit
676
+
(** [insert_text_at parent text before_node] inserts text content.
677
+
678
+
If [before_node] is [None], appends at the end. If the previous sibling
679
+
is a text node, the text is merged into it (text nodes are coalesced).
680
+
Otherwise, a new text node is created.
681
+
682
+
This implements the HTML5 parser's text insertion algorithm which
683
+
ensures adjacent text nodes are always merged, matching browser behavior.
684
+
685
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#appropriate-place-for-inserting-a-node>
686
+
WHATWG: Inserting text in the DOM
687
+
*)
688
+
689
+
(** {1 Attribute Operations}
690
+
691
+
Functions to read and modify element attributes. Attributes are
692
+
name-value pairs that provide additional information about elements.
693
+
694
+
In HTML5, attribute names are case-insensitive and normalized to
695
+
lowercase by the parser.
696
+
697
+
@see <https://html.spec.whatwg.org/multipage/dom.html#attributes>
698
+
WHATWG: Attributes
699
+
*)
700
+
701
+
val get_attr : node -> string -> string option
702
+
(** [get_attr node name] returns the value of attribute [name], or [None]
703
+
if the attribute doesn't exist.
704
+
705
+
Attribute lookup is case-sensitive on the stored (lowercase) names.
706
+
*)
707
+
708
+
val set_attr : node -> string -> string -> unit
709
+
(** [set_attr node name value] sets attribute [name] to [value].
710
+
711
+
If the attribute already exists, it is replaced.
712
+
If it doesn't exist, it is added.
713
+
*)
714
+
715
+
val has_attr : node -> string -> bool
716
+
(** [has_attr node name] returns [true] if the node has attribute [name]. *)
717
+
718
+
(** {1 Tree Traversal}
719
+
720
+
Functions to navigate the DOM tree.
721
+
*)
722
+
723
+
val descendants : node -> node list
724
+
(** [descendants node] returns all descendant nodes in document order.
725
+
726
+
This performs a depth-first traversal, returning children before
727
+
siblings at each level. The node itself is not included.
728
+
729
+
{b Document order} is the order nodes appear in the HTML source:
730
+
parent before children, earlier siblings before later ones.
731
+
732
+
{b Example:}
733
+
{[
734
+
(* For tree: div > (p > "hello", span > "world") *)
735
+
descendants div
736
+
(* Returns: [p; text("hello"); span; text("world")] *)
737
+
]}
738
+
*)
739
+
740
+
val ancestors : node -> node list
741
+
(** [ancestors node] returns all ancestor nodes from parent to root.
742
+
743
+
The first element is the immediate parent, the last is the root
744
+
(usually the Document node).
745
+
746
+
{b Example:}
747
+
{[
748
+
(* For a text node inside: html > body > p > text *)
749
+
ancestors text_node
750
+
(* Returns: [p; body; html; #document] *)
751
+
]}
752
+
*)
753
+
754
+
val get_text_content : node -> string
755
+
(** [get_text_content node] returns the concatenated text content.
756
+
757
+
For text nodes, returns the text data directly.
758
+
For elements, recursively concatenates all descendant text content.
759
+
For other node types, returns an empty string.
760
+
761
+
{b Example:}
762
+
{[
763
+
(* For: <p>Hello <b>world</b>!</p> *)
764
+
get_text_content p_element
765
+
(* Returns: "Hello world!" *)
766
+
]}
767
+
*)
768
+
769
+
(** {1 Cloning} *)
770
+
771
+
val clone : ?deep:bool -> node -> node
772
+
(** [clone ?deep node] creates a copy of the node.
773
+
774
+
@param deep If [true], recursively clone all descendants (default: [false])
775
+
776
+
The cloned node has no parent. With [~deep:false], only the node itself
777
+
is copied (with its attributes, but not its children).
778
+
779
+
{b Example:}
780
+
{[
781
+
let original = create_element "div" ~attrs:[("class", "box")] () in
782
+
let shallow = clone original in
783
+
let deep = clone ~deep:true original in
(shallow, deep)
784
+
]}
785
+
*)
786
+
787
+
(** {1 Serialization} *)
788
+
789
+
val to_html : ?pretty:bool -> ?indent_size:int -> ?indent:int -> node -> string
790
+
(** [to_html ?pretty ?indent_size ?indent node] converts a DOM node to an
791
+
HTML string.
792
+
793
+
@param pretty If [true] (default), format with indentation and newlines
794
+
@param indent_size Number of spaces per indentation level (default: 2)
795
+
@param indent Starting indentation level (default: 0)
796
+
@return The HTML string representation of the node
797
+
*)
798
+
799
+
val to_writer :
800
+
?pretty:bool ->
801
+
?indent_size:int ->
802
+
?indent:int ->
803
+
Bytesrw.Bytes.Writer.t ->
804
+
node ->
805
+
unit
806
+
(** [to_writer ?pretty ?indent_size ?indent writer node] streams a DOM node
807
+
as HTML to a bytes writer.
808
+
809
+
This is more memory-efficient than {!to_html} for large documents as it
810
+
doesn't build intermediate strings.
811
+
812
+
@param pretty If [true] (default), format with indentation and newlines
813
+
@param indent_size Number of spaces per indentation level (default: 2)
814
+
@param indent Starting indentation level (default: 0)
815
+
@param writer The bytes writer to output to
816
+
*)
817
+
818
+
val to_test_format : ?indent:int -> node -> string
819
+
(** [to_test_format ?indent node] converts a DOM node to the html5lib test
820
+
format.
821
+
822
+
This format is used by the html5lib test suite for comparing parser
823
+
output. It represents the DOM tree in a human-readable, line-based format.
824
+
825
+
@param indent Starting indentation level (default: 0)
826
+
@return The test format string representation
827
+
*)
828
+
829
+
val to_text : ?separator:string -> ?strip:bool -> node -> string
830
+
(** [to_text ?separator ?strip node] extracts all text content from a node.
831
+
832
+
Recursively collects text from all descendant text nodes.
833
+
834
+
@param separator String to insert between text nodes (default: [" "])
835
+
@param strip If [true] (default), trim whitespace from result
836
+
@return The concatenated text content
837
+
*)
+101
lib/encoding/html5rw_encoding.mli
+101
lib/encoding/html5rw_encoding.mli
···
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** HTML5 Encoding Detection and Decoding
7
+
8
+
This module implements the WHATWG encoding sniffing and decoding
9
+
algorithms for HTML5 documents. It handles automatic character
10
+
encoding detection from byte order marks (BOM), meta charset
11
+
declarations, and transport layer hints.
12
+
13
+
{2 Encoding Detection Algorithm}
14
+
15
+
The encoding detection follows the WHATWG specification:
16
+
1. Check for a BOM (UTF-8, UTF-16LE, UTF-16BE)
17
+
2. Prescan for [<meta charset>] or [<meta http-equiv="content-type">]
18
+
3. Use transport layer encoding hint if provided
19
+
4. Fall back to UTF-8 as the default
20
+
21
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
22
+
WHATWG encoding sniffing algorithm
23
+
*)
24
+
25
+
(** {1 Types} *)
26
+
27
+
(** Character encodings supported by the parser.
28
+
29
+
The HTML5 specification requires support for a large number of
30
+
encodings, but this implementation focuses on the most common ones.
31
+
Other encodings are mapped to their closest equivalent.
32
+
*)
33
+
type encoding = Encoding.t =
34
+
| Utf8 (** UTF-8 encoding (default) *)
35
+
| Utf16le (** UTF-16 little-endian *)
36
+
| Utf16be (** UTF-16 big-endian *)
37
+
| Windows_1252 (** Windows-1252 (Latin-1 superset) *)
38
+
| Iso_8859_2 (** ISO-8859-2 (Central European) *)
39
+
| Euc_jp (** EUC-JP (Japanese) *)
40
+
41
+
(** {1 Encoding Utilities} *)
42
+
43
+
val encoding_to_string : encoding -> string
44
+
(** Convert an encoding to its canonical label string.
45
+
46
+
Returns the WHATWG canonical name, e.g., ["utf-8"], ["utf-16le"].
47
+
*)
48
+
49
+
val sniff_bom : bytes -> (encoding * int) option
50
+
(** Detect encoding from a byte order mark.
51
+
52
+
Examines the first bytes of the input for a BOM and returns the
53
+
detected encoding with the number of bytes to skip.
54
+
55
+
@return [Some (encoding, skip_bytes)] if a BOM is found,
56
+
[None] otherwise.
57
+
*)
58
+
59
+
val normalize_label : string -> encoding option
60
+
(** Normalize an encoding label to its canonical form.
61
+
62
+
Maps encoding labels (case-insensitive, with optional whitespace)
63
+
to the supported encoding types.
64
+
65
+
@return [Some encoding] if the label is recognized, [None] otherwise.
66
+
67
+
{[
68
+
normalize_label "UTF-8" (* Some Utf8 *)
69
+
normalize_label "utf8" (* Some Utf8 *)
70
+
normalize_label "latin1" (* Some Windows_1252 *)
71
+
]}
72
+
*)
73
+
74
+
val prescan_for_meta_charset : bytes -> encoding option
75
+
(** Prescan bytes to find a meta charset declaration.
76
+
77
+
Implements the WHATWG prescan algorithm that looks for encoding
78
+
declarations in the first 1024 bytes of an HTML document.
79
+
80
+
@return [Some encoding] if a meta charset is found, [None] otherwise.
81
+
*)
82
+
83
+
(** {1 Decoding} *)
84
+
85
+
val decode : bytes -> ?transport_encoding:string -> unit -> string * encoding
86
+
(** Decode raw bytes to a UTF-8 string with automatic encoding detection.
87
+
88
+
This function implements the full encoding sniffing algorithm:
89
+
1. Check for BOM
90
+
2. Prescan for meta charset
91
+
3. Use transport encoding hint if provided
92
+
4. Fall back to UTF-8
93
+
94
+
@param transport_encoding Encoding hint from HTTP Content-Type header
95
+
@return [(decoded_string, detected_encoding)]
96
+
97
+
{[
98
+
let (html, enc) = decode raw_bytes ()
99
+
(* html is now a UTF-8 string, enc is the detected encoding *)
100
+
]}
101
+
*)
+155
lib/selector/html5rw_selector.mli
+155
lib/selector/html5rw_selector.mli
···
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** CSS Selector Engine
7
+
8
+
This module provides CSS selector parsing and matching for querying
9
+
the HTML5 DOM. It supports a subset of CSS3 selectors suitable for
10
+
common web scraping and DOM manipulation tasks.
11
+
12
+
{2 Supported Selectors}
13
+
14
+
{3 Simple Selectors}
15
+
- Tag: [div], [p], [span]
16
+
- ID: [#myid]
17
+
- Class: [.myclass]
18
+
- Universal: [*]
19
+
20
+
{3 Attribute Selectors}
21
+
- Presence: [[attr]]
22
+
- Exact match: [[attr="value"]]
23
+
- Contains word: [[attr~="value"]]
24
+
- Starts with: [[attr^="value"]]
25
+
- Ends with: [[attr$="value"]]
26
+
- Contains: [[attr*="value"]]
27
+
- Hyphen-separated: [[attr|="value"]]
28
+
29
+
{3 Pseudo-classes}
30
+
- [:first-child], [:last-child]
31
+
- [:nth-child(n)], [:nth-last-child(n)]
32
+
- [:only-child]
33
+
- [:empty]
34
+
- [:not(selector)]
35
+
36
+
{3 Combinators}
37
+
- Descendant: [div p] (p anywhere inside div)
38
+
- Child: [div > p] (p direct child of div)
39
+
- Adjacent sibling: [div + p] (p immediately after div)
40
+
- General sibling: [div ~ p] (p after div, same parent)
41
+
42
+
{2 Usage}
43
+
44
+
{[
45
+
let doc = Html5rw.parse reader in
46
+
47
+
(* Find all paragraphs *)
48
+
let paragraphs = Html5rw.query doc "p" in
49
+
50
+
(* Find links with specific class *)
51
+
let links = Html5rw.query doc "a.external" in
52
+
53
+
(* Find table cells in rows *)
54
+
let cells = Html5rw.query doc "tr > td" in
55
+
56
+
(* Check if a node matches *)
57
+
let is_active = Html5rw.matches node ".active"
58
+
]}
59
+
*)
60
+
61
+
(** {1 Exceptions} *)
62
+
63
+
exception Selector_error of string
64
+
(** Raised when a selector string is malformed.
65
+
66
+
The exception contains an error message describing the parse error.
67
+
*)
68
+
69
+
(** {1 Sub-modules} *)
70
+
71
+
(** Abstract syntax tree for parsed selectors. *)
72
+
module Ast : sig
73
+
type simple_selector_type = Selector_ast.simple_selector_type =
74
+
| Type_tag (** Tag (element name) selector, e.g. [div]. *)
75
+
| Type_id (** ID selector, e.g. [#myid]. *)
76
+
| Type_class (** Class selector, e.g. [.myclass]. *)
77
+
| Type_universal (** Universal selector [*]. *)
78
+
| Type_attr (** Attribute selector, e.g. [[attr="value"]]. *)
79
+
| Type_pseudo (** Pseudo-class selector, e.g. [:first-child]. *)
80
+
81
+
type simple_selector = Selector_ast.simple_selector = {
82
+
selector_type : simple_selector_type; (** Which kind of simple selector this record describes. *)
83
+
name : string option; (* NOTE(review): presumably the tag, id, class, attribute or pseudo-class name; confirm against the parser. *)
84
+
operator : string option; (* NOTE(review): presumably the attribute match operator such as [~=] or [^=]; confirm. *)
85
+
value : string option; (* NOTE(review): presumably the attribute value being compared; confirm. *)
86
+
arg : string option; (* NOTE(review): presumably the argument of functional pseudo-classes such as [:nth-child(n)] or [:not(selector)]; confirm. *)
87
+
}
88
+
89
+
type compound_selector = Selector_ast.compound_selector = {
90
+
selectors : simple_selector list; (** Simple selectors making up the compound selector. *)
91
+
}
92
+
93
+
type complex_selector = Selector_ast.complex_selector = {
94
+
parts : (string option * compound_selector) list; (* NOTE(review): the [string option] presumably holds the combinator attached to each compound selector; confirm. *)
95
+
}
96
+
97
+
type selector_list = Selector_ast.selector_list = {
98
+
selectors : complex_selector list; (** Complex selectors in the list. Shares its field name with {!compound_selector}, so a type annotation may be needed to pick the intended record. *)
99
+
}
100
+
101
+
type selector = Selector_ast.selector =
102
+
| Simple of simple_selector (** A single simple selector. *)
103
+
| Compound of compound_selector (** A compound selector. *)
104
+
| Complex of complex_selector (** A complex selector. *)
105
+
| List of selector_list (** A list of complex selectors. *)
106
+
107
+
val make_simple :
108
+
simple_selector_type ->
109
+
?name:string ->
110
+
?operator:string ->
111
+
?value:string ->
112
+
?arg:string ->
113
+
unit ->
114
+
simple_selector (** [make_simple ty ?name ?operator ?value ?arg ()] returns a {!simple_selector}. NOTE(review): omitted optional arguments presumably become [None] fields; confirm. *)
115
+
116
+
val make_compound : simple_selector list -> compound_selector (** Build a {!compound_selector} from a list of simple selectors. *)
117
+
val make_complex : (string option * compound_selector) list -> complex_selector (** Build a {!complex_selector} from its list of parts. *)
118
+
val make_list : complex_selector list -> selector_list (** Build a {!selector_list} from a list of complex selectors. *)
119
+
end
120
+
121
+
(** Token types for the selector lexer. *)
122
+
module Token : sig
123
+
type t = Selector_token.t
124
+
end
125
+
126
+
(** {1 Functions} *)
127
+
128
+
val parse : string -> Ast.selector
129
+
(** Parse a CSS selector string.
130
+
131
+
@raise Selector_error if the selector is malformed.
132
+
*)
133
+
134
+
val query : Html5rw_dom.node -> string -> Html5rw_dom.node list
135
+
(** Query the DOM tree with a CSS selector.
136
+
137
+
Returns all nodes matching the selector in document order.
138
+
139
+
@raise Selector_error if the selector is malformed.
140
+
141
+
{[
142
+
let divs = query root_node "div.content > p"
143
+
]}
144
+
*)
145
+
146
+
val matches : Html5rw_dom.node -> string -> bool
147
+
(** Check if a node matches a CSS selector.
148
+
149
+
@raise Selector_error if the selector is malformed.
150
+
151
+
{[
152
+
if matches node ".active" then
153
+
print_endline "node has class \"active\""
154
+
]}
155
+
*)
+223
lib/tokenizer/html5rw_tokenizer.mli
+223
lib/tokenizer/html5rw_tokenizer.mli
···
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** HTML5 Tokenizer
7
+
8
+
This module implements the WHATWG HTML5 tokenization algorithm. The
9
+
tokenizer converts an input byte stream into a sequence of tokens
10
+
(start tags, end tags, text, comments, doctypes) that can be consumed
11
+
by a tree builder.
12
+
*)
13
+
14
+
(** {1 Sub-modules} *)
15
+
16
+
(** Token types produced by the tokenizer. *)
17
+
module Token : sig
18
+
type tag_kind = Token.tag_kind = Start | End (* Distinguishes the two kinds of tag token carried by [tag.kind]. *)
19
+
20
+
type doctype = Token.doctype = {
21
+
name : string option; (** Doctype name, if any. *)
22
+
public_id : string option; (** Public identifier, if any. *)
23
+
system_id : string option; (** System identifier, if any. *)
24
+
force_quirks : bool; (* NOTE(review): presumably the force-quirks flag of the WHATWG DOCTYPE token; confirm. *)
25
+
}
26
+
27
+
type tag = Token.tag = {
28
+
kind : tag_kind; (** Whether this is a [Start] or an [End] tag. *)
29
+
name : string; (** Tag name. *)
30
+
attrs : (string * string) list; (* NOTE(review): presumably (attribute name, attribute value) pairs; confirm. *)
31
+
self_closing : bool; (* NOTE(review): presumably the self-closing flag of the tag token; confirm. *)
32
+
}
33
+
34
+
type t = Token.t =
35
+
| Tag of tag (** A start or end tag; see {!tag}. *)
36
+
| Character of string (** Character data, carried as a string. *)
37
+
| Comment of string (** A comment, carried as a string. *)
38
+
| Doctype of doctype (** A doctype; see {!doctype}. *)
39
+
| EOF (** End of input. *)
40
+
41
+
val make_start_tag : string -> (string * string) list -> bool -> t (* NOTE(review): the positional arguments presumably are tag name, attributes and self-closing flag, mirroring the [tag] record; confirm. *)
42
+
val make_end_tag : string -> t (* NOTE(review): presumably takes the tag name; confirm. *)
43
+
val make_doctype :
44
+
?name:string ->
45
+
?public_id:string ->
46
+
?system_id:string ->
47
+
?force_quirks:bool ->
48
+
unit ->
49
+
t (** [make_doctype ?name ?public_id ?system_id ?force_quirks ()] returns a token. NOTE(review): defaults for omitted arguments are not visible in this interface. *)
50
+
val make_comment : string -> t (** Build a token from comment text. *)
51
+
val make_character : string -> t (** Build a token from character data. *)
52
+
val eof : t (* NOTE(review): presumably the [EOF] token; confirm. *)
53
+
end
54
+
55
+
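(** Tokenizer states. Each constructor is named after a state of the WHATWG tokenization state machine (e.g. [Data], [Tag_open], [Comment_end]). *)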
56
+
module State : sig
57
+
type t = State.t =
58
+
| Data
59
+
| Rcdata
60
+
| Rawtext
61
+
| Script_data
62
+
| Plaintext
63
+
| Tag_open
64
+
| End_tag_open
65
+
| Tag_name
66
+
| Rcdata_less_than_sign
67
+
| Rcdata_end_tag_open
68
+
| Rcdata_end_tag_name
69
+
| Rawtext_less_than_sign
70
+
| Rawtext_end_tag_open
71
+
| Rawtext_end_tag_name
72
+
| Script_data_less_than_sign
73
+
| Script_data_end_tag_open
74
+
| Script_data_end_tag_name
75
+
| Script_data_escape_start
76
+
| Script_data_escape_start_dash
77
+
| Script_data_escaped
78
+
| Script_data_escaped_dash
79
+
| Script_data_escaped_dash_dash
80
+
| Script_data_escaped_less_than_sign
81
+
| Script_data_escaped_end_tag_open
82
+
| Script_data_escaped_end_tag_name
83
+
| Script_data_double_escape_start
84
+
| Script_data_double_escaped
85
+
| Script_data_double_escaped_dash
86
+
| Script_data_double_escaped_dash_dash
87
+
| Script_data_double_escaped_less_than_sign
88
+
| Script_data_double_escape_end
89
+
| Before_attribute_name
90
+
| Attribute_name
91
+
| After_attribute_name
92
+
| Before_attribute_value
93
+
| Attribute_value_double_quoted
94
+
| Attribute_value_single_quoted
95
+
| Attribute_value_unquoted
96
+
| After_attribute_value_quoted
97
+
| Self_closing_start_tag
98
+
| Bogus_comment
99
+
| Markup_declaration_open
100
+
| Comment_start
101
+
| Comment_start_dash
102
+
| Comment
103
+
| Comment_less_than_sign
104
+
| Comment_less_than_sign_bang
105
+
| Comment_less_than_sign_bang_dash
106
+
| Comment_less_than_sign_bang_dash_dash
107
+
| Comment_end_dash
108
+
| Comment_end
109
+
| Comment_end_bang
110
+
| Doctype
111
+
| Before_doctype_name
112
+
| Doctype_name
113
+
| After_doctype_name
114
+
| After_doctype_public_keyword
115
+
| Before_doctype_public_identifier
116
+
| Doctype_public_identifier_double_quoted
117
+
| Doctype_public_identifier_single_quoted
118
+
| After_doctype_public_identifier
119
+
| Between_doctype_public_and_system_identifiers
120
+
| After_doctype_system_keyword
121
+
| Before_doctype_system_identifier
122
+
| Doctype_system_identifier_double_quoted
123
+
| Doctype_system_identifier_single_quoted
124
+
| After_doctype_system_identifier
125
+
| Bogus_doctype
126
+
| Cdata_section
127
+
| Cdata_section_bracket
128
+
| Cdata_section_end
129
+
| Character_reference
130
+
| Named_character_reference
131
+
| Ambiguous_ampersand
132
+
| Numeric_character_reference
133
+
| Hexadecimal_character_reference_start
134
+
| Decimal_character_reference_start
135
+
| Hexadecimal_character_reference
136
+
| Decimal_character_reference
137
+
| Numeric_character_reference_end
138
+
end
139
+
140
+
(** Parse error types. *)
141
+
module Errors : sig
142
+
type t = Errors.t = {
143
+
code : string; (* NOTE(review): presumably a WHATWG parse error name; confirm. *)
144
+
line : int; (* NOTE(review): whether lines are 0- or 1-based is not visible in this interface. *)
145
+
column : int; (* NOTE(review): whether columns are 0- or 1-based is not visible in this interface. *)
146
+
}
147
+
148
+
val make : code:string -> line:int -> column:int -> t (** [make ~code ~line ~column] returns an error value built from the three labelled arguments. *)
149
+
val to_string : t -> string (** Render an error as a string; the exact format is defined by the implementation. *)
150
+
end
151
+
152
+
(** Input stream with position tracking. *)
153
+
module Stream : sig
154
+
type t = Stream.t (** The input stream. *)
155
+
156
+
val create : string -> t (** Create a stream from a string. *)
157
+
val create_from_reader : Bytesrw.Bytes.Reader.t -> t (** Create a stream from a bytes reader. *)
158
+
val set_error_callback : t -> (string -> unit) -> unit (* NOTE(review): the callback receives a string, presumably an error code; confirm when it is invoked. *)
159
+
val position : t -> int * int (* NOTE(review): presumably the current (line, column); confirm order and base. *)
160
+
end
161
+
162
+
(** {1 Token Sink Interface} *)
163
+
164
+
(** Interface for token consumers.
165
+
166
+
The tokenizer calls [process] for each token it produces. The sink
167
+
can return [`Continue] to keep tokenizing, or [`SwitchTo state] to
168
+
change the tokenizer state (used by the tree builder for things like
169
+
[<script>] and [<textarea>]).
170
+
*)
171
+
module type SINK = sig
172
+
type t (** State of the token consumer. *)
173
+
val process : t -> Token.t -> [ `Continue | `SwitchTo of State.t ] (** Consume one token and tell the tokenizer whether to continue or switch to another state. *)
174
+
val adjusted_current_node_in_html_namespace : t -> bool (* NOTE(review): presumably consulted by the tokenizer when deciding whether CDATA sections are allowed, as in the WHATWG markup declaration open state; confirm. *)
175
+
end
176
+
177
+
(** {1 Tokenizer} *)
178
+
179
+
(** The tokenizer type, parameterized by the sink type. *)
180
+
type 'sink t
181
+
182
+
val create :
183
+
(module SINK with type t = 'sink) ->
184
+
'sink ->
185
+
?collect_errors:bool ->
186
+
?xml_mode:bool ->
187
+
unit ->
188
+
'sink t
189
+
(** Create a new tokenizer.
190
+
191
+
@param sink The token sink that will receive tokens
192
+
@param collect_errors If [true], collect parse errors (default: [false])
193
+
@param xml_mode If [true], apply XML compatibility transformations
194
+
*)
195
+
196
+
val run :
197
+
'sink t ->
198
+
(module SINK with type t = 'sink) ->
199
+
Bytesrw.Bytes.Reader.t ->
200
+
unit
201
+
(** Run the tokenizer on the given input.
202
+
203
+
The tokenizer will read from the reader and call the sink's [process]
204
+
function for each token until EOF is reached.
205
+
*)
206
+
207
+
val get_errors : 'sink t -> Errors.t list
208
+
(** Get the list of parse errors encountered during tokenization.
209
+
210
+
Only populated if [collect_errors:true] was passed to {!create}.
211
+
*)
212
+
213
+
val set_state : 'sink t -> State.t -> unit
214
+
(** Set the tokenizer state.
215
+
216
+
Used by the tree builder to switch states for raw text elements.
217
+
*)
218
+
219
+
val set_last_start_tag : 'sink t -> string -> unit
220
+
(** Set the last start tag name.
221
+
222
+
Used by the tree builder to track the context for end tag matching.
223
+
*)