OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 2.9 kB view raw
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** CSS Selector Engine

    This module provides CSS selector parsing and matching for querying
    the HTML5 DOM. It supports a subset of CSS3 selectors suitable for
    common web scraping and DOM manipulation tasks.

    The module is a thin facade: every item below is an alias for a
    definition in one of the [Selector_*] implementation modules, gathered
    here so that callers need a single entry point.

    {2 Supported Selectors}

    {3 Simple Selectors}
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]

    {3 Attribute Selectors}
    - Presence: [[attr]]
    - Exact match: [[attr="value"]]
    - Contains word: [[attr~="value"]]
    - Starts with: [[attr^="value"]]
    - Ends with: [[attr$="value"]]
    - Contains: [[attr*="value"]]
    - Hyphen-separated: [[attr|="value"]]

    {3 Pseudo-classes}
    - [:first-child], [:last-child]
    - [:nth-child(n)], [:nth-last-child(n)]
    - [:only-child]
    - [:empty]
    - [:not(selector)]

    {3 Combinators}
    - Descendant: [div p] (p anywhere inside div)
    - Child: [div > p] (p direct child of div)
    - Adjacent sibling: [div + p] (p immediately after div)
    - General sibling: [div ~ p] (p after div, same parent)

    {2 Usage}

    {[
      let doc = Html5rw.parse reader in

      (* Find all paragraphs *)
      let paragraphs = Html5rw.query doc "p" in

      (* Find links with specific class *)
      let links = Html5rw.query doc "a.external" in

      (* Find table cells in rows *)
      let cells = Html5rw.query doc "tr > td" in

      (* Check if a node matches *)
      let is_active = Html5rw.matches node ".active"
    ]}
*)

(** {1 Error Types} *)

(** CSS selector error codes.

    Alias of [Selector_error_code]; these are the typed codes carried by
    {!Selector_error}.
*)
module Error_code = Selector_error_code

(** {1 Exceptions} *)

(** Raised when a selector string is malformed.

    The exception contains a typed error code describing the parse error.

    This rebinds [Selector_lexer.Selector_error] rather than declaring a new
    exception, so both names denote the same constructor and a handler
    written against either one catches the error.
*)
exception Selector_error = Selector_lexer.Selector_error

(** {1 Sub-modules} *)

(** Abstract syntax tree for parsed selectors. Alias of [Selector_ast]. *)
module Ast = Selector_ast

(** Token types for the selector lexer. Alias of [Selector_token]. *)
module Token = Selector_token

(** {1 Functions} *)

(** [parse selector] parses a CSS selector string.

    Alias of [Selector_parser.parse_selector]. NOTE(review): the result is
    presumably a selector value described by {!Ast}; confirm against
    [Selector_parser], whose signature is not visible from this file.

    @raise Selector_error if the selector is malformed.
*)
let parse = Selector_parser.parse_selector

(** [query node selector] queries the DOM tree rooted at [node] with a CSS
    selector.

    Returns all nodes matching the selector in document order.
    Alias of [Selector_match.query].

    @raise Selector_error if the selector is malformed.

    {[
      let divs = query root_node "div.content > p"
    ]}
*)
let query = Selector_match.query

(** [matches node selector] checks whether [node] matches a CSS selector.

    Alias of [Selector_match.matches].

    @raise Selector_error if the selector is malformed.

    {[
      (* Runs the branch only when the node has class "active" *)
      if matches node ".active" then
        print_endline "node is active"
    ]}
*)
let matches = Selector_match.matches