OCaml HTML5 parser/serialiser based on Python's JustHTML

Update package metadata and remove the `re` dependency

+17 -1
.gitignore
··· 1 - _build 1 + # OCaml build artifacts 2 + _build/ 3 + *.install 4 + *.merlin 5 + 6 + # Third-party sources (fetch locally with opam source) 7 + third_party/ 8 + 9 + # Editor and OS files 10 + .DS_Store 11 + *.swp 12 + *~ 13 + .vscode/ 14 + .idea/ 15 + 16 + # Opam local switch 17 + _opam/
+1
.ocamlformat
··· 1 + version=0.28.1
+53
.tangled/workflows/build.yml
··· 1 + when: 2 + - event: ["push", "pull_request"] 3 + branch: ["main"] 4 + 5 + engine: nixery 6 + 7 + dependencies: 8 + nixpkgs: 9 + - shell 10 + - stdenv 11 + - findutils 12 + - binutils 13 + - libunwind 14 + - ncurses 15 + - opam 16 + - git 17 + - gawk 18 + - gnupatch 19 + - gnum4 20 + - gnumake 21 + - gnutar 22 + - gnused 23 + - gnugrep 24 + - diffutils 25 + - gzip 26 + - bzip2 27 + - gcc 28 + - ocaml 29 + - pkg-config 30 + 31 + steps: 32 + - name: opam 33 + command: | 34 + opam init --disable-sandboxing -a -y 35 + - name: repo 36 + command: | 37 + opam repo add aoah https://tangled.org/anil.recoil.org/aoah-opam-repo.git 38 + - name: switch 39 + command: | 40 + opam install . --confirm-level=unsafe-yes --deps-only 41 + - name: build 42 + command: | 43 + opam exec -- dune build 44 + - name: switch-test 45 + command: | 46 + opam install . --confirm-level=unsafe-yes --deps-only --with-test 47 + - name: test 48 + command: | 49 + opam exec -- dune runtest --verbose 50 + - name: doc 51 + command: | 52 + opam install -y odoc 53 + opam exec -- dune build @doc
+59
README.md
··· 1 + # html5rw - Pure OCaml HTML5 Parser 2 + 3 + A pure OCaml HTML5 parser implementing the WHATWG HTML5 parsing specification. This library passes the html5lib-tests suite and provides full support for tokenization, tree construction, encoding detection, and CSS selector queries. 4 + 5 + ## Key Features 6 + 7 + - **WHATWG Compliant**: Implements the full HTML5 parsing algorithm with proper error recovery 8 + - **CSS Selectors**: Query the DOM using standard CSS selector syntax 9 + - **Streaming I/O**: Uses bytesrw for efficient streaming input/output 10 + - **Encoding Detection**: Automatic character encoding detection following the WHATWG algorithm 11 + - **Entity Decoding**: Complete HTML5 named character reference support 12 + 13 + ## Usage 14 + 15 + ```ocaml 16 + open Bytesrw 17 + 18 + (* Parse HTML from a string *) 19 + let html = "<html><body><p>Hello, world!</p></body></html>" 20 + let reader = Bytes.Reader.of_string html 21 + let doc = Html5rw.parse reader 22 + 23 + (* Query with CSS selectors *) 24 + let paragraphs = Html5rw.query doc "p" 25 + 26 + (* Extract text content *) 27 + let text = Html5rw.to_text doc 28 + 29 + (* Serialize back to HTML *) 30 + let output = Html5rw.to_string doc 31 + ``` 32 + 33 + For fragment parsing (innerHTML): 34 + 35 + ```ocaml 36 + (* Parse as innerHTML of a <div> *) 37 + let ctx = Html5rw.make_fragment_context ~tag_name:"div" () 38 + let reader = Bytes.Reader.of_string "<p>Fragment content</p>" 39 + let doc = Html5rw.parse ~fragment_context:ctx reader 40 + ``` 41 + 42 + ## Installation 43 + 44 + ``` 45 + opam install html5rw 46 + ``` 47 + 48 + ## Documentation 49 + 50 + API documentation is available via: 51 + 52 + ``` 53 + opam install html5rw 54 + odig doc html5rw 55 + ``` 56 + 57 + ## License 58 + 59 + MIT
+13 -8
dune-project
··· 1 - (lang dune 3.0) 1 + (lang dune 3.20) 2 + 2 3 (name html5rw) 3 - (version 0.1.0) 4 4 5 5 (generate_opam_files true) 6 6 7 - (source (github username/html5rw)) 8 7 (license MIT) 9 - (authors "Author") 10 - (maintainers "author@example.com") 8 + (authors "Anil Madhavapeddy <anil@recoil.org>") 9 + (homepage "https://tangled.org/@anil.recoil.org/ocaml-html5rw") 10 + (maintainers "Anil Madhavapeddy <anil@recoil.org>") 11 + (bug_reports "https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues") 12 + (maintenance_intent "(latest)") 11 13 12 14 (package 13 15 (name html5rw) 14 16 (synopsis "Pure OCaml HTML5 parser implementing the WHATWG specification") 15 - (description "A pure OCaml HTML5 parser that passes the html5lib-tests suite. Implements the WHATWG HTML5 parsing specification including tokenization, tree construction, encoding detection, and CSS selector queries.") 17 + (description 18 + "A pure OCaml HTML5 parser that passes the html5lib-tests suite. \ 19 + Implements the WHATWG HTML5 parsing specification including tokenization, \ 20 + tree construction, encoding detection, and CSS selector queries.") 16 21 (depends 17 - (ocaml (>= 4.14.0)) 22 + (ocaml (>= 5.1.0)) 18 23 (bytesrw (>= 0.3.0)) 19 24 (uutf (>= 1.0.0)) 20 - (re (>= 1.10.0)) 25 + (odoc :with-doc) 21 26 (jsont (and :with-test (>= 0.2.0)))))
+8 -10
html5rw.opam
··· 1 1 # This file is generated by dune, edit dune-project instead 2 2 opam-version: "2.0" 3 - version: "0.1.0" 4 3 synopsis: "Pure OCaml HTML5 parser implementing the WHATWG specification" 5 4 description: 6 5 "A pure OCaml HTML5 parser that passes the html5lib-tests suite. Implements the WHATWG HTML5 parsing specification including tokenization, tree construction, encoding detection, and CSS selector queries." 7 - maintainer: ["author@example.com"] 8 - authors: ["Author"] 6 + maintainer: ["Anil Madhavapeddy <anil@recoil.org>"] 7 + authors: ["Anil Madhavapeddy <anil@recoil.org>"] 9 8 license: "MIT" 10 - homepage: "https://github.com/username/html5rw" 11 - bug-reports: "https://github.com/username/html5rw/issues" 9 + homepage: "https://tangled.org/@anil.recoil.org/ocaml-html5rw" 10 + bug-reports: "https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues" 12 11 depends: [ 13 - "dune" {>= "3.0"} 14 - "ocaml" {>= "4.14.0"} 12 + "dune" {>= "3.20"} 13 + "ocaml" {>= "5.1.0"} 15 14 "bytesrw" {>= "0.3.0"} 16 15 "uutf" {>= "1.0.0"} 17 - "re" {>= "1.10.0"} 16 + "odoc" {with-doc} 18 17 "jsont" {with-test & >= "0.2.0"} 19 - "odoc" {with-doc} 20 18 ] 21 19 build: [ 22 20 ["dune" "subst"] {dev} ··· 32 30 "@doc" {with-doc} 33 31 ] 34 32 ] 35 - dev-repo: "git+https://github.com/username/html5rw.git" 33 + x-maintenance-intent: ["(latest)"]
+11 -1
lib/dom/html5rw_dom.ml
··· 1 - (* html5rw.dom - HTML5 DOM types and operations *) 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** HTML5 DOM Types and Operations 7 + 8 + This module provides DOM manipulation functions for HTML5 documents. 9 + It includes node creation, tree traversal, attribute manipulation, 10 + and serialization. 11 + *) 2 12 3 13 include Node 4 14
+5
lib/dom/node.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 1 6 (* HTML5 DOM node types *) 2 7 3 8 type doctype_data = {
+5
lib/dom/node.mli
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 1 6 (** HTML5 DOM Node Types and Operations 2 7 3 8 This module provides the DOM node representation used by the HTML5 parser.
+5
lib/dom/serialize.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 1 6 (* HTML5 DOM serialization *) 2 7 3 8 open Bytesrw
+89 -7
lib/encoding/html5rw_encoding.ml
··· 1 - (* html5rw.encoding - HTML5 encoding detection and decoding *) 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** HTML5 Encoding Detection and Decoding 7 + 8 + This module implements the WHATWG encoding sniffing and decoding 9 + algorithms for HTML5 documents. It handles automatic character 10 + encoding detection from byte order marks (BOM), meta charset 11 + declarations, and transport layer hints. 12 + 13 + {2 Encoding Detection Algorithm} 14 + 15 + The encoding detection follows the WHATWG specification: 16 + 1. Check for a BOM (UTF-8, UTF-16LE, UTF-16BE) 17 + 2. Prescan for [<meta charset>] or [<meta http-equiv="content-type">] 18 + 3. Use transport layer encoding hint if provided 19 + 4. Fall back to UTF-8 as the default 20 + 21 + @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding> 22 + WHATWG encoding sniffing algorithm 23 + *) 2 24 25 + (** {1 Types} *) 26 + 27 + (** Character encodings supported by the parser. 28 + 29 + The HTML5 specification requires support for a large number of 30 + encodings, but this implementation focuses on the most common ones. 31 + Other encodings are mapped to their closest equivalent. 32 + *) 3 33 type encoding = Encoding.t = 4 - | Utf8 5 - | Utf16le 6 - | Utf16be 7 - | Windows_1252 8 - | Iso_8859_2 9 - | Euc_jp 34 + | Utf8 (** UTF-8 encoding (default) *) 35 + | Utf16le (** UTF-16 little-endian *) 36 + | Utf16be (** UTF-16 big-endian *) 37 + | Windows_1252 (** Windows-1252 (Latin-1 superset) *) 38 + | Iso_8859_2 (** ISO-8859-2 (Central European) *) 39 + | Euc_jp (** EUC-JP (Japanese) *) 40 + 41 + (** {1 Encoding Utilities} *) 42 + 43 + (** Convert an encoding to its canonical label string. 
10 44 45 + Returns the WHATWG canonical name, e.g., ["utf-8"], ["utf-16le"]. 46 + *) 11 47 let encoding_to_string = Encoding.to_string 12 48 49 + (** Detect encoding from a byte order mark. 50 + 51 + Examines the first bytes of the input for a BOM and returns the 52 + detected encoding with the number of bytes to skip. 53 + 54 + @return [(Some (encoding, skip_bytes))] if a BOM is found, 55 + [None] otherwise. 56 + *) 13 57 let sniff_bom = Bom.sniff 14 58 59 + (** Normalize an encoding label to its canonical form. 60 + 61 + Maps encoding labels (case-insensitive, with optional whitespace) 62 + to the supported encoding types. 63 + 64 + @return [Some encoding] if the label is recognized, [None] otherwise. 65 + 66 + {[ 67 + normalize_label "UTF-8" (* Some Utf8 *) 68 + normalize_label "utf8" (* Some Utf8 *) 69 + normalize_label "latin1" (* Some Windows_1252 *) 70 + ]} 71 + *) 15 72 let normalize_label = Labels.normalize_label 16 73 74 + (** Prescan bytes to find a meta charset declaration. 75 + 76 + Implements the WHATWG prescan algorithm that looks for encoding 77 + declarations in the first 1024 bytes of an HTML document. 78 + 79 + @return [Some encoding] if a meta charset is found, [None] otherwise. 80 + *) 17 81 let prescan_for_meta_charset = Prescan.prescan_for_meta_charset 18 82 83 + (** {1 Decoding} *) 84 + 85 + (** Decode raw bytes to a UTF-8 string with automatic encoding detection. 86 + 87 + This function implements the full encoding sniffing algorithm: 88 + 1. Check for BOM 89 + 2. Prescan for meta charset 90 + 3. Use transport encoding hint if provided 91 + 4. Fall back to UTF-8 92 + 93 + @param transport_encoding Encoding hint from HTTP Content-Type header 94 + @return [(decoded_string, detected_encoding)] 95 + 96 + {[ 97 + let (html, enc) = decode raw_bytes () 98 + (* html is now a UTF-8 string, enc is the detected encoding *) 99 + ]} 100 + *) 19 101 let decode = Decode.decode
+90 -1
lib/entities/html5rw_entities.ml
··· 1 - (* html5rw.entities - HTML5 entity decoding *) 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** HTML5 Named Character Reference Decoding 7 + 8 + This module provides functions for decoding HTML5 named character 9 + references (entities) and numeric character references. It includes 10 + the complete table of 2,231 named character references defined in 11 + the WHATWG HTML5 specification. 12 + 13 + {2 Character Reference Types} 14 + 15 + HTML5 supports three types of character references: 16 + 17 + {3 Named References} 18 + - Standard form: [&amp;], [&lt;], [&gt;], [&nbsp;] 19 + - Some entities have multiple codepoint outputs: [&NotNestedLessLess;] 20 + 21 + {3 Decimal Numeric References} 22 + - Form: [&#123;] (decimal codepoint) 23 + 24 + {3 Hexadecimal Numeric References} 25 + - Form: [&#x7B;] or [&#X7B;] (hexadecimal codepoint) 26 + 27 + {2 Legacy Entity Handling} 28 + 29 + Some named entities are "legacy" - they were supported without a 30 + trailing semicolon in older browsers (e.g., [&amp] instead of [&amp;]). 31 + The parser handles these according to the WHATWG specification. 32 + 33 + @see <https://html.spec.whatwg.org/multipage/named-characters.html> 34 + The complete list of named character references 35 + *) 36 + 37 + (** {1 Decoding Functions} *) 38 + 39 + (** Decode all character references in a text string. 40 + 41 + Processes the string and replaces all valid character references 42 + (named and numeric) with their decoded UTF-8 equivalents. 43 + 44 + {[ 45 + decode "Hello &amp; goodbye" 46 + (* Returns: "Hello & goodbye" *) 2 47 48 + decode "&#60;script&#62;" 49 + (* Returns: "<script>" *) 50 + ]} 51 + *) 3 52 let decode = Decode.decode_entities_in_text 4 53 54 + (** Decode a numeric character reference. 
55 + 56 + @param codepoint The Unicode codepoint to decode 57 + @return The UTF-8 string representation 58 + 59 + Note: Some codepoints are replaced according to the HTML5 60 + specification (e.g., control characters in the 0x80-0x9F range 61 + are mapped to Windows-1252 equivalents). 62 + *) 5 63 let decode_numeric = Numeric_ref.decode 6 64 65 + (** Look up a named character reference. 66 + 67 + @param name The entity name without [&] and [;] (e.g., ["amp"]) 68 + @return [Some codepoints] if the entity exists, [None] otherwise 69 + 70 + {[ 71 + lookup "amp" (* Some [0x26] *) 72 + lookup "nbsp" (* Some [0xA0] *) 73 + lookup "bogus" (* None *) 74 + ]} 75 + *) 7 76 let lookup = Entity_table.lookup 8 77 78 + (** Check if an entity is a legacy entity. 79 + 80 + Legacy entities are those that were historically recognized without 81 + a trailing semicolon. The parser handles these specially to maintain 82 + browser compatibility. 83 + 84 + {[ 85 + is_legacy "amp" (* true - &amp works without ; *) 86 + is_legacy "nbsp" (* true *) 87 + is_legacy "alpha" (* false - requires semicolon *) 88 + ]} 89 + *) 9 90 let is_legacy = Entity_table.is_legacy 10 91 92 + (** Convert a Unicode codepoint to its UTF-8 encoding. 93 + 94 + @param codepoint The Unicode codepoint (0 to 0x10FFFF) 95 + @return The UTF-8 encoded string 96 + *) 11 97 let codepoint_to_utf8 = Numeric_ref.codepoint_to_utf8 12 98 99 + (** {1 Sub-modules} *) 100 + 101 + (** Numeric character reference handling. *) 13 102 module Numeric_ref = Numeric_ref
+5
lib/html5rw/html5rw.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 1 6 (** Html5rw - Pure OCaml HTML5 Parser 2 7 3 8 This module provides a complete HTML5 parsing solution following the
+5
lib/html5rw/html5rw.mli
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 1 6 (** Html5rw - Pure OCaml HTML5 Parser 2 7 3 8 This module provides a complete HTML5 parsing solution following the
+5
lib/parser/html5rw_parser.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 1 6 (* html5rw.parser - HTML5 parser with bytesrw-only API *) 2 7 3 8 module Dom = Html5rw_dom
+6 -1
lib/parser/html5rw_parser.mli
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 1 6 (** HTML5 Parser 2 7 3 8 This module provides the core HTML5 parsing functionality implementing ··· 172 177 173 178 @raise Html5rw_selector.Selector_error if the selector is invalid 174 179 175 - @see {!Html5rw_selector} for supported selector syntax 180 + See {!Html5rw_selector} for supported selector syntax. 176 181 *) 177 182 178 183 (** {1 Serialization} *)
+5
lib/parser/parser.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 1 6 (* Main parser entry point - bytesrw-only API *) 2 7 3 8 open Bytesrw
+1 -1
lib/selector/dune
··· 1 1 (library 2 2 (name html5rw_selector) 3 3 (public_name html5rw.selector) 4 - (libraries html5rw.dom re)) 4 + (libraries html5rw.dom))
+95 -1
lib/selector/html5rw_selector.ml
··· 1 - (* html5rw.selector - CSS selector engine *) 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 2 5 6 + (** CSS Selector Engine 7 + 8 + This module provides CSS selector parsing and matching for querying 9 + the HTML5 DOM. It supports a subset of CSS3 selectors suitable for 10 + common web scraping and DOM manipulation tasks. 11 + 12 + {2 Supported Selectors} 13 + 14 + {3 Simple Selectors} 15 + - Tag: [div], [p], [span] 16 + - ID: [#myid] 17 + - Class: [.myclass] 18 + - Universal: [*] 19 + 20 + {3 Attribute Selectors} 21 + - Presence: [[attr]] 22 + - Exact match: [[attr="value"]] 23 + - Contains word: [[attr~="value"]] 24 + - Starts with: [[attr^="value"]] 25 + - Ends with: [[attr$="value"]] 26 + - Contains: [[attr*="value"]] 27 + - Hyphen-separated: [[attr|="value"]] 28 + 29 + {3 Pseudo-classes} 30 + - [:first-child], [:last-child] 31 + - [:nth-child(n)], [:nth-last-child(n)] 32 + - [:only-child] 33 + - [:empty] 34 + - [:not(selector)] 35 + 36 + {3 Combinators} 37 + - Descendant: [div p] (p anywhere inside div) 38 + - Child: [div > p] (p direct child of div) 39 + - Adjacent sibling: [div + p] (p immediately after div) 40 + - General sibling: [div ~ p] (p after div, same parent) 41 + 42 + {2 Usage} 43 + 44 + {[ 45 + let doc = Html5rw.parse reader in 46 + 47 + (* Find all paragraphs *) 48 + let paragraphs = Html5rw.query doc "p" in 49 + 50 + (* Find links with specific class *) 51 + let links = Html5rw.query doc "a.external" in 52 + 53 + (* Find table cells in rows *) 54 + let cells = Html5rw.query doc "tr > td" in 55 + 56 + (* Check if a node matches *) 57 + let is_active = Html5rw.matches node ".active" 58 + ]} 59 + *) 60 + 61 + (** {1 Exceptions} *) 62 + 63 + (** Raised when a selector string is malformed. 
64 + 65 + The exception contains an error message describing the parse error. 66 + *) 3 67 exception Selector_error = Selector_lexer.Selector_error 4 68 69 + (** {1 Sub-modules} *) 70 + 71 + (** Abstract syntax tree for parsed selectors. *) 5 72 module Ast = Selector_ast 73 + 74 + (** Token types for the selector lexer. *) 6 75 module Token = Selector_token 7 76 77 + (** {1 Functions} *) 78 + 79 + (** Parse a CSS selector string. 80 + 81 + @raise Selector_error if the selector is malformed. 82 + *) 8 83 let parse = Selector_parser.parse_selector 9 84 85 + (** Query the DOM tree with a CSS selector. 86 + 87 + Returns all nodes matching the selector in document order. 88 + 89 + @raise Selector_error if the selector is malformed. 90 + 91 + {[ 92 + let divs = query root_node "div.content > p" 93 + ]} 94 + *) 10 95 let query = Selector_match.query 11 96 97 + (** Check if a node matches a CSS selector. 98 + 99 + @raise Selector_error if the selector is malformed. 100 + 101 + {[ 102 + if matches node ".active" then 103 + (* node has class "active" *) 104 + ]} 105 + *) 12 106 let matches = Selector_match.matches
+15 -1
lib/selector/selector_match.ml
··· 3 3 module Dom = Html5rw_dom 4 4 open Selector_ast 5 5 6 + (* Check if haystack contains needle as a substring *) 7 + let string_contains ~haystack ~needle = 8 + let needle_len = String.length needle in 9 + let haystack_len = String.length haystack in 10 + if needle_len > haystack_len then false 11 + else if needle_len = 0 then true 12 + else 13 + let rec check i = 14 + if i > haystack_len - needle_len then false 15 + else if String.sub haystack i needle_len = needle then true 16 + else check (i + 1) 17 + in 18 + check 0 19 + 6 20 let is_element node = 7 21 let name = node.Dom.name in 8 22 name <> "#text" && name <> "#comment" && name <> "#document" && ··· 177 191 String.sub attr_value 0 (String.length value) = value 178 192 | Some "$=" -> value <> "" && String.length attr_value >= String.length value && 179 193 String.sub attr_value (String.length attr_value - String.length value) (String.length value) = value 180 - | Some "*=" -> value <> "" && Re.execp (Re.compile (Re.str value)) attr_value 194 + | Some "*=" -> value <> "" && string_contains ~haystack:attr_value ~needle:value 181 195 | Some _ | None -> false)) 182 196 | None -> false) 183 197 | Type_pseudo ->
+3
test/dune
··· 128 128 (executable (name template_debug3) (libraries bytesrw html5rw.parser html5rw.dom)) 129 129 (executable (name template_debug4) (libraries bytesrw html5rw.parser html5rw.dom)) 130 130 (executable (name ns_sens_test) (libraries bytesrw html5rw.parser html5rw.dom)) 131 + (executable (name debug2) (libraries bytesrw html5rw.parser html5rw.dom)) 132 + (executable (name debug_title) (libraries bytesrw html5rw.parser html5rw.dom)) 133 + (executable (name nobr_debug2) (libraries bytesrw html5rw.parser html5rw.dom))
+18 -12
test/nobr_debug2.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (* Test nobr element handling in SVG fragment context *) 7 + 8 + open Bytesrw 9 + 1 10 module Parser = Html5rw_parser 2 11 module Dom = Html5rw_dom 3 12 4 13 let rec print_tree indent node = 5 - Printf.printf "%s%s (ns=%s, %d children)\n" 6 - indent 7 - node.Dom.name 14 + Printf.printf "%s%s (ns=%s, %d children)\n" 15 + indent 16 + node.Dom.name 8 17 (match node.Dom.namespace with Some s -> s | None -> "html") 9 18 (List.length node.Dom.children); 10 19 List.iter (print_tree (indent ^ " ")) node.Dom.children ··· 12 21 let () = 13 22 let input = "<nobr>X" in 14 23 print_endline "Starting..."; 15 - let context = { Parser.Tree_builder.tag_name = "path"; namespace = Some "svg" } in 16 - 17 - (* Create parser state directly for inspection *) 18 - let t = Parser.Tree_builder.create ~collect_errors:true ~fragment_context:context input in 19 - print_endline "\nInitial tree structure:"; 20 - print_tree "" t.Parser.Tree_builder.document; 21 - print_endline "\nInitial stack size:"; 22 - Printf.printf "%d elements\n" (List.length t.Parser.Tree_builder.open_elements); 23 - List.iter (fun n -> Printf.printf " - %s\n" n.Dom.name) t.Parser.Tree_builder.open_elements 24 + let context = Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") () in 25 + let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in 26 + print_endline "\nFinal tree structure:"; 27 + print_tree "" (Parser.root result); 28 + print_endline "\nTest format:"; 29 + print_endline (Dom.to_test_format (Parser.root result))