(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** CSS Selector Engine

    This module provides CSS selector parsing and matching for querying
    the HTML5 DOM. It supports a subset of CSS3 selectors suitable for
    common web scraping and DOM manipulation tasks.

    {2 Supported Selectors}

    {3 Simple Selectors}
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]

    {3 Attribute Selectors}
    - Presence: [[attr]]
    - Exact match: [[attr="value"]]
    - Contains word: [[attr~="value"]]
    - Starts with: [[attr^="value"]]
    - Ends with: [[attr$="value"]]
    - Contains: [[attr*="value"]]
    - Hyphen-separated: [[attr|="value"]]

    {3 Pseudo-classes}
    - [:first-child], [:last-child]
    - [:nth-child(n)], [:nth-last-child(n)]
    - [:only-child]
    - [:empty]
    - [:not(selector)]

    {3 Combinators}
    - Descendant: [div p] (p anywhere inside div)
    - Child: [div > p] (p direct child of div)
    - Adjacent sibling: [div + p] (p immediately after div)
    - General sibling: [div ~ p] (p after div, same parent)

    {2 Usage}

    {[
      let doc = Html5rw.parse reader in

      (* Find all paragraphs *)
      let paragraphs = Html5rw.query doc "p" in

      (* Find links with specific class *)
      let links = Html5rw.query doc "a.external" in

      (* Find table cells in rows *)
      let cells = Html5rw.query doc "tr > td" in

      (* Check if a node matches *)
      let is_active = Html5rw.matches node ".active"
    ]}
*)

(** {1 Error Types} *)

(** CSS selector error codes.

    Alias for the [Selector_error_code] module, re-exported here so that
    callers can reach the error codes through this module. *)
module Error_code = Selector_error_code

(** {1 Exceptions} *)

(** Raised when a selector string is malformed.

    The exception contains a typed error code describing the parse error.

    This is a rebinding of [Selector_lexer.Selector_error], not a new
    exception: a handler written against either name catches the same
    exception. *)
exception Selector_error = Selector_lexer.Selector_error

(** {1 Sub-modules} *)

(** Abstract syntax tree for parsed selectors.

    Alias for the [Selector_ast] module. *)
module Ast = Selector_ast

(** Token types for the selector lexer.

    Alias for the [Selector_token] module. *)
module Token = Selector_token

(** {1 Functions} *)

(** Parse a CSS selector string.

    This is [Selector_parser.parse_selector], re-exported under a shorter
    name; see that function for the exact signature.

    @raise Selector_error if the selector is malformed. *)
let parse = Selector_parser.parse_selector

(** Query the DOM tree with a CSS selector.

    Returns all nodes matching the selector in document order.

    This is [Selector_match.query], re-exported; see that function for the
    exact signature.

    {[
      let divs = query root_node "div.content > p"
    ]}

    @raise Selector_error if the selector is malformed. *)
let query = Selector_match.query

(** Check if a node matches a CSS selector.

    This is [Selector_match.matches], re-exported; see that function for the
    exact signature.

    {[
      if matches node ".active" then
        (* node has class "active" *)
        ()
    ]}

    @raise Selector_error if the selector is malformed. *)
let matches = Selector_match.matches