(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** CSS Selector Engine

    This module provides CSS selector parsing and matching for querying
    the HTML5 DOM. It supports a subset of CSS3 selectors suitable for
    common web scraping and DOM manipulation tasks.

    {2 Supported Selectors}

    {3 Simple Selectors}
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]

    {3 Attribute Selectors}
    - Presence: [[attr]]
    - Exact match: [[attr="value"]]
    - Contains word: [[attr~="value"]]
    - Starts with: [[attr^="value"]]
    - Ends with: [[attr$="value"]]
    - Contains: [[attr*="value"]]
    - Hyphen-separated: [[attr|="value"]]

    {3 Pseudo-classes}
    - [:first-child], [:last-child]
    - [:nth-child(n)], [:nth-last-child(n)]
    - [:only-child]
    - [:empty]
    - [:not(selector)]

    {3 Combinators}
    - Descendant: [div p] (p anywhere inside div)
    - Child: [div > p] (p direct child of div)
    - Adjacent sibling: [div + p] (p immediately after div)
    - General sibling: [div ~ p] (p after div, same parent)

    {2 Usage}

    {[
      let doc = Html5rw.parse reader in

      (* Find all paragraphs *)
      let paragraphs = Html5rw.query doc "p" in

      (* Find links with specific class *)
      let links = Html5rw.query doc "a.external" in

      (* Find table cells in rows *)
      let cells = Html5rw.query doc "tr > td" in

      (* Check if a node matches *)
      let is_active = Html5rw.matches node ".active"
    ]}
*)

(** {1 Error Types} *)

(** CSS selector error codes.

    This module provides the {!Error_code.t} variant type that represents
    all possible errors when parsing CSS selectors.
*)
module Error_code : sig
  (** The reason a selector string was rejected. Each constructor names
      one specific parse failure. *)
  type t =
    | Empty_selector
        (** The selector string was empty or contained only whitespace. *)
    | Unterminated_string
        (** A quoted string was not closed before end of input. *)
    | Unterminated_escape
        (** An escape sequence was not completed before end of input. *)
    | Expected_identifier_after_hash
        (** Expected an identifier after [#] for ID selector. *)
    | Expected_identifier_after_dot
        (** Expected an identifier after [.] for class selector. *)
    | Expected_attribute_name
        (** Expected an attribute name inside an attribute selector. *)
    | Expected_closing_bracket
        (** Expected [\]] to close an attribute selector. *)
    | Expected_equals_after_operator of char
        (** Expected [=] after an attribute operator like [~], [|], [^],
            [$], or [*]. *)
    | Unexpected_character_in_attribute_selector
        (** Found an unexpected character inside an attribute selector. *)
    | Expected_pseudo_class_name
        (** Expected a pseudo-class name after [:]. *)
    | Expected_closing_paren
        (** Expected [)] to close a pseudo-class argument. *)
    | Unexpected_character of char
        (** Found an unexpected character in the selector. *)
    | Expected_attribute_value
        (** Expected a value after the attribute operator. *)
    | Expected_closing_bracket_or_operator
        (** Expected [\]] or an attribute operator like [=]. *)
    | Expected_selector_after_combinator
        (** Expected a selector after a combinator ([>], [+], [~], or
            space). *)
    | Unexpected_token
        (** Found an unexpected token in the selector. *)
    | Expected_end_of_selector
        (** Expected end of selector but found more tokens. *)

  val to_string : t -> string
  (** Convert to a kebab-case string identifier suitable for programmatic
      use. *)

  val to_human_string : t -> string
  (** Convert to a human-readable error message. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a selector error code. *)
end

(** {1 Exceptions} *)

exception Selector_error of Error_code.t
(** Raised when a selector string is malformed.

    The exception carries a typed {!Error_code.t} describing the parse
    error. Use {!Error_code.to_string} or {!Error_code.to_human_string} to
    obtain a string representation.
*)

(** {1 Sub-modules} *)

(** Abstract syntax tree for parsed selectors.

    Every type below is declared equal to the type of the same name in
    [Selector_ast] and re-exposes its definition, so values built through
    either path are interchangeable. *)
module Ast : sig
  (** The kind of a {!simple_selector}. *)
  type simple_selector_type = Selector_ast.simple_selector_type =
    | Type_tag
    | Type_id
    | Type_class
    | Type_universal
    | Type_attr
    | Type_pseudo

  (** A single simple selector.

      Which of the optional fields are populated for a given
      [selector_type] is decided by the parser and is not specified by
      this interface. *)
  type simple_selector = Selector_ast.simple_selector = {
    selector_type : simple_selector_type;
    name : string option;
    operator : string option;
    value : string option;
    arg : string option;
  }

  (** A group of simple selectors. *)
  type compound_selector = Selector_ast.compound_selector = {
    selectors : simple_selector list;
  }

  (** A list of compound selectors, each paired with an optional string.

      NOTE(review): the string is presumably the combinator attached to
      that compound selector; confirm against the parser. *)
  type complex_selector = Selector_ast.complex_selector = {
    parts : (string option * compound_selector) list;
  }

  (** A list of complex selectors.

      The field is named [selectors], the same as the field of
      {!compound_selector}; a type annotation may be needed to pick the
      intended record type when building or matching on these records. *)
  type selector_list = Selector_ast.selector_list = {
    selectors : complex_selector list;
  }

  (** A selector at any of the four levels of nesting defined above. *)
  type selector = Selector_ast.selector =
    | Simple of simple_selector
    | Compound of compound_selector
    | Complex of complex_selector
    | List of selector_list

  val make_simple :
    simple_selector_type ->
    ?name:string ->
    ?operator:string ->
    ?value:string ->
    ?arg:string ->
    unit ->
    simple_selector
  (** Construct a {!simple_selector} of the given type. The optional
      labelled arguments carry the same names as the optional record
      fields; the trailing [unit] allows omitted optionals to be erased. *)

  val make_compound : simple_selector list -> compound_selector
  (** Construct a {!compound_selector} from a list of simple selectors. *)

  val make_complex :
    (string option * compound_selector) list -> complex_selector
  (** Construct a {!complex_selector} from a list of
      (optional string, compound selector) pairs. *)

  val make_list : complex_selector list -> selector_list
  (** Construct a {!selector_list} from a list of complex selectors. *)

  val pp_simple_selector_type :
    Format.formatter -> simple_selector_type -> unit
  (** Pretty-print a simple selector type. *)

  val pp_simple_selector : Format.formatter -> simple_selector -> unit
  (** Pretty-print a simple selector. *)

  val pp_compound_selector : Format.formatter -> compound_selector -> unit
  (** Pretty-print a compound selector. *)

  val pp_complex_selector : Format.formatter -> complex_selector -> unit
  (** Pretty-print a complex selector. *)

  val pp_selector_list : Format.formatter -> selector_list -> unit
  (** Pretty-print a selector list. *)

  val pp : Format.formatter -> selector -> unit
  (** Pretty-print a selector. *)
end

(** Token types for the selector lexer. *)
module Token : sig
  (** Alias of [Selector_token.t]. *)
  type t = Selector_token.t
end

(** {1 Functions} *)

val parse : string -> Ast.selector
(** [parse s] parses the CSS selector string [s] into an {!Ast.selector}.

    @raise Selector_error if the selector is malformed.
*)

val query : Dom.node -> string -> Dom.node list
(** [query node selector] queries the DOM tree at [node] with the CSS
    selector string [selector].

    Returns all nodes matching the selector in document order.

    @raise Selector_error if the selector is malformed.

    {[
      let divs = query root_node "div.content > p"
    ]}
*)

val matches : Dom.node -> string -> bool
(** [matches node selector] checks whether [node] matches the CSS selector
    string [selector].

    @raise Selector_error if the selector is malformed.

    {[
      (* true when node has class "active" *)
      if matches node ".active" then print_endline "node is active"
    ]}
*)