OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 7.0 kB view raw
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** CSS Selector Engine

    This module provides CSS selector parsing and matching for querying
    the HTML5 DOM. It supports a subset of CSS3 selectors suitable for
    common web scraping and DOM manipulation tasks.

    {2 Supported Selectors}

    {3 Simple Selectors}
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]

    {3 Attribute Selectors}
    - Presence: [[attr]]
    - Exact match: [[attr="value"]]
    - Contains word: [[attr~="value"]]
    - Starts with: [[attr^="value"]]
    - Ends with: [[attr$="value"]]
    - Contains: [[attr*="value"]]
    - Hyphen-separated: [[attr|="value"]]

    {3 Pseudo-classes}
    - [:first-child], [:last-child]
    - [:nth-child(n)], [:nth-last-child(n)]
    - [:only-child]
    - [:empty]
    - [:not(selector)]

    {3 Combinators}
    - Descendant: [div p] (p anywhere inside div)
    - Child: [div > p] (p direct child of div)
    - Adjacent sibling: [div + p] (p immediately after div)
    - General sibling: [div ~ p] (p after div, same parent)

    {2 Usage}

    {[
      let doc = Html5rw.parse reader in

      (* Find all paragraphs *)
      let paragraphs = Html5rw.query doc "p" in

      (* Find links with specific class *)
      let links = Html5rw.query doc "a.external" in

      (* Find table cells in rows *)
      let cells = Html5rw.query doc "tr > td" in

      (* Check if a node matches *)
      let is_active = Html5rw.matches node ".active"
    ]}
*)

(** {1 Error Types} *)

(** CSS selector error codes.

    This module provides the {!Error_code.t} variant type that represents
    all possible errors when parsing CSS selectors.
*)
module Error_code : sig
  type t =
    | Empty_selector
        (** The selector string was empty or contained only whitespace. *)
    | Unterminated_string
        (** A quoted string was not closed before end of input. *)
    | Unterminated_escape
        (** An escape sequence was not completed before end of input. *)
    | Expected_identifier_after_hash
        (** Expected an identifier after [#] for ID selector. *)
    | Expected_identifier_after_dot
        (** Expected an identifier after [.] for class selector. *)
    | Expected_attribute_name
        (** Expected an attribute name inside an attribute selector. *)
    | Expected_closing_bracket
        (** Expected [\]] to close an attribute selector. *)
    | Expected_equals_after_operator of char
        (** Expected [=] after an attribute operator like [~], [|], [^], [$], or [*]. *)
    | Unexpected_character_in_attribute_selector
        (** Found an unexpected character inside an attribute selector. *)
    | Expected_pseudo_class_name
        (** Expected a pseudo-class name after [:]. *)
    | Expected_closing_paren
        (** Expected [)] to close a pseudo-class argument. *)
    | Unexpected_character of char
        (** Found an unexpected character in the selector. *)
    | Expected_attribute_value
        (** Expected a value after the attribute operator. *)
    | Expected_closing_bracket_or_operator
        (** Expected [\]] or an attribute operator like [=]. *)
    | Expected_selector_after_combinator
        (** Expected a selector after a combinator ([>], [+], [~], or space). *)
    | Unexpected_token
        (** Found an unexpected token in the selector. *)
    | Expected_end_of_selector
        (** Expected end of selector but found more tokens. *)

  val to_string : t -> string
  (** Convert to a kebab-case string identifier suitable for programmatic use. *)

  val to_human_string : t -> string
  (** Convert to a human-readable error message. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty-print a selector error code. *)
end

(** {1 Exceptions} *)

exception Selector_error of Error_code.t
(** Raised when a selector string is malformed.

    The exception contains a typed error code describing the parse error.
    Use {!Error_code.to_string} or {!Error_code.to_human_string} to get
    a string representation.
*)

(** {1 Sub-modules} *)

(** Abstract syntax tree for parsed selectors.

    Every type below is re-exported from [Selector_ast] with its definition
    repeated, so constructors and record fields can be used through this
    module while the types remain equal to the [Selector_ast] originals.
*)
module Ast : sig

  (** Kind of a simple selector.

      The constructors appear to line up with the selector families listed
      in the module overview (tag, ID, class, universal, attribute and
      pseudo-class) -- NOTE(review): confirm against [Selector_ast].
  *)
  type simple_selector_type = Selector_ast.simple_selector_type =
    | Type_tag
    | Type_id
    | Type_class
    | Type_universal
    | Type_attr
    | Type_pseudo

  (** A single simple selector.

      Which of the optional fields are populated for a given
      [selector_type] is decided by the parser and is not specified by this
      interface -- TODO confirm against [Selector_ast] and document here.
  *)
  type simple_selector = Selector_ast.simple_selector = {
    selector_type : simple_selector_type;
    name : string option;
    operator : string option;
    value : string option;
    arg : string option;
  }

  (** A list of simple selectors grouped into one compound selector.

      Note that this record shares the field name [selectors] with
      {!selector_list}; a type annotation may be needed to disambiguate.
  *)
  type compound_selector = Selector_ast.compound_selector = {
    selectors : simple_selector list;
  }

  (** A list of compound selectors, each paired with a [string option].

      The [string option] presumably carries the combinator that links the
      compound selector to its neighbour -- TODO confirm which side it binds
      to and when it is [None].
  *)
  type complex_selector = Selector_ast.complex_selector = {
    parts : (string option * compound_selector) list;
  }

  (** A list of complex selectors.

      Shares the field name [selectors] with {!compound_selector}.
  *)
  type selector_list = Selector_ast.selector_list = {
    selectors : complex_selector list;
  }

  (** A parsed selector at any of the four levels of the grammar. *)
  type selector = Selector_ast.selector =
    | Simple of simple_selector
    | Compound of compound_selector
    | Complex of complex_selector
    | List of selector_list

  val make_simple :
    simple_selector_type ->
    ?name:string ->
    ?operator:string ->
    ?value:string ->
    ?arg:string ->
    unit ->
    simple_selector
  (** Build a {!simple_selector} of the given type.

      The optional arguments presumably fill the record fields of the same
      names, with omitted ones left as [None] -- TODO confirm. The trailing
      [unit] argument lets the optional arguments be omitted.
  *)

  val make_compound : simple_selector list -> compound_selector
  (** Build a {!compound_selector} from a list of simple selectors. *)

  val make_complex : (string option * compound_selector) list -> complex_selector
  (** Build a {!complex_selector} from a list of
      [(string option, compound selector)] pairs. *)

  val make_list : complex_selector list -> selector_list
  (** Build a {!selector_list} from a list of complex selectors. *)

  val pp_simple_selector_type : Format.formatter -> simple_selector_type -> unit
  (** Pretty-print a simple selector type. *)

  val pp_simple_selector : Format.formatter -> simple_selector -> unit
  (** Pretty-print a simple selector. *)

  val pp_compound_selector : Format.formatter -> compound_selector -> unit
  (** Pretty-print a compound selector. *)

  val pp_complex_selector : Format.formatter -> complex_selector -> unit
  (** Pretty-print a complex selector. *)

  val pp_selector_list : Format.formatter -> selector_list -> unit
  (** Pretty-print a selector list. *)

  val pp : Format.formatter -> selector -> unit
  (** Pretty-print a selector. *)
end

(** Token types for the selector lexer. *)
module Token : sig
  type t = Selector_token.t
  (** Alias of [Selector_token.t]. *)
end

(** {1 Functions} *)

val parse : string -> Ast.selector
(** Parse a CSS selector string into an {!Ast.selector}.

    @raise Selector_error if the selector is malformed.
*)

val query : Dom.node -> string -> Dom.node list
(** Query the DOM tree with a CSS selector.

    Returns all nodes matching the selector in document order.

    @raise Selector_error if the selector is malformed.

    {[
      let divs = query root_node "div.content > p"
    ]}
*)

val matches : Dom.node -> string -> bool
(** Check if a node matches a CSS selector.

    @raise Selector_error if the selector is malformed.

    {[
      if matches node ".active" then
        (* node has class "active" *)
        print_endline "active"
    ]}
*)