+17
-1
.gitignore
+17
-1
.gitignore
+1
.ocamlformat
+1
.ocamlformat
···
1
+
version=0.28.1
+53
.tangled/workflows/build.yml
+53
.tangled/workflows/build.yml
···
1
+
when:
2
+
- event: ["push", "pull_request"]
3
+
branch: ["main"]
4
+
5
+
engine: nixery
6
+
7
+
dependencies:
8
+
nixpkgs:
9
+
- shell
10
+
- stdenv
11
+
- findutils
12
+
- binutils
13
+
- libunwind
14
+
- ncurses
15
+
- opam
16
+
- git
17
+
- gawk
18
+
- gnupatch
19
+
- gnum4
20
+
- gnumake
21
+
- gnutar
22
+
- gnused
23
+
- gnugrep
24
+
- diffutils
25
+
- gzip
26
+
- bzip2
27
+
- gcc
28
+
- ocaml
29
+
- pkg-config
30
+
31
+
steps:
32
+
- name: opam
33
+
command: |
34
+
opam init --disable-sandboxing -a -y
35
+
- name: repo
36
+
command: |
37
+
opam repo add aoah https://tangled.org/anil.recoil.org/aoah-opam-repo.git
38
+
- name: switch
39
+
command: |
40
+
opam install . --confirm-level=unsafe-yes --deps-only
41
+
- name: build
42
+
command: |
43
+
opam exec -- dune build
44
+
- name: switch-test
45
+
command: |
46
+
opam install . --confirm-level=unsafe-yes --deps-only --with-test
47
+
- name: test
48
+
command: |
49
+
opam exec -- dune runtest --verbose
50
+
- name: doc
51
+
command: |
52
+
opam install -y odoc
53
+
opam exec -- dune build @doc
+59
README.md
+59
README.md
···
1
+
# html5rw - Pure OCaml HTML5 Parser
2
+
3
+
A pure OCaml HTML5 parser implementing the WHATWG HTML5 parsing specification. This library passes the html5lib-tests suite and provides full support for tokenization, tree construction, encoding detection, and CSS selector queries.
4
+
5
+
## Key Features
6
+
7
+
- **WHATWG Compliant**: Implements the full HTML5 parsing algorithm with proper error recovery
8
+
- **CSS Selectors**: Query the DOM using standard CSS selector syntax
9
+
- **Streaming I/O**: Uses bytesrw for efficient streaming input/output
10
+
- **Encoding Detection**: Automatic character encoding detection following the WHATWG algorithm
11
+
- **Entity Decoding**: Complete HTML5 named character reference support
12
+
13
+
## Usage
14
+
15
+
```ocaml
16
+
open Bytesrw
17
+
18
+
(* Parse HTML from a string *)
19
+
let html = "<html><body><p>Hello, world!</p></body></html>"
20
+
let reader = Bytes.Reader.of_string html
21
+
let doc = Html5rw.parse reader
22
+
23
+
(* Query with CSS selectors *)
24
+
let paragraphs = Html5rw.query doc "p"
25
+
26
+
(* Extract text content *)
27
+
let text = Html5rw.to_text doc
28
+
29
+
(* Serialize back to HTML *)
30
+
let output = Html5rw.to_string doc
31
+
```
32
+
33
+
For fragment parsing (innerHTML):
34
+
35
+
```ocaml
36
+
(* Parse as innerHTML of a <div> *)
37
+
let ctx = Html5rw.make_fragment_context ~tag_name:"div" ()
38
+
let reader = Bytes.Reader.of_string "<p>Fragment content</p>"
39
+
let doc = Html5rw.parse ~fragment_context:ctx reader
40
+
```
41
+
42
+
## Installation
43
+
44
+
```
45
+
opam install html5rw
46
+
```
47
+
48
+
## Documentation
49
+
50
+
API documentation is available via:
51
+
52
+
```
53
+
opam install html5rw
54
+
odig doc html5rw
55
+
```
56
+
57
+
## License
58
+
59
+
MIT
+13
-8
dune-project
+13
-8
dune-project
···
1
-
(lang dune 3.0)
1
+
(lang dune 3.20)
2
+
2
3
(name html5rw)
3
-
(version 0.1.0)
4
4
5
5
(generate_opam_files true)
6
6
7
-
(source (github username/html5rw))
8
7
(license MIT)
9
-
(authors "Author")
10
-
(maintainers "author@example.com")
8
+
(authors "Anil Madhavapeddy <anil@recoil.org>")
9
+
(homepage "https://tangled.org/@anil.recoil.org/ocaml-html5rw")
10
+
(maintainers "Anil Madhavapeddy <anil@recoil.org>")
11
+
(bug_reports "https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues")
12
+
(maintenance_intent "(latest)")
11
13
12
14
(package
13
15
(name html5rw)
14
16
(synopsis "Pure OCaml HTML5 parser implementing the WHATWG specification")
15
-
(description "A pure OCaml HTML5 parser that passes the html5lib-tests suite. Implements the WHATWG HTML5 parsing specification including tokenization, tree construction, encoding detection, and CSS selector queries.")
17
+
(description
18
+
"A pure OCaml HTML5 parser that passes the html5lib-tests suite. \
19
+
Implements the WHATWG HTML5 parsing specification including tokenization, \
20
+
tree construction, encoding detection, and CSS selector queries.")
16
21
(depends
17
-
(ocaml (>= 4.14.0))
22
+
(ocaml (>= 5.1.0))
18
23
(bytesrw (>= 0.3.0))
19
24
(uutf (>= 1.0.0))
20
-
(re (>= 1.10.0))
25
+
(odoc :with-doc)
21
26
(jsont (and :with-test (>= 0.2.0)))))
+8
-10
html5rw.opam
+8
-10
html5rw.opam
···
1
1
# This file is generated by dune, edit dune-project instead
2
2
opam-version: "2.0"
3
-
version: "0.1.0"
4
3
synopsis: "Pure OCaml HTML5 parser implementing the WHATWG specification"
5
4
description:
6
5
"A pure OCaml HTML5 parser that passes the html5lib-tests suite. Implements the WHATWG HTML5 parsing specification including tokenization, tree construction, encoding detection, and CSS selector queries."
7
-
maintainer: ["author@example.com"]
8
-
authors: ["Author"]
6
+
maintainer: ["Anil Madhavapeddy <anil@recoil.org>"]
7
+
authors: ["Anil Madhavapeddy <anil@recoil.org>"]
9
8
license: "MIT"
10
-
homepage: "https://github.com/username/html5rw"
11
-
bug-reports: "https://github.com/username/html5rw/issues"
9
+
homepage: "https://tangled.org/@anil.recoil.org/ocaml-html5rw"
10
+
bug-reports: "https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues"
12
11
depends: [
13
-
"dune" {>= "3.0"}
14
-
"ocaml" {>= "4.14.0"}
12
+
"dune" {>= "3.20"}
13
+
"ocaml" {>= "5.1.0"}
15
14
"bytesrw" {>= "0.3.0"}
16
15
"uutf" {>= "1.0.0"}
17
-
"re" {>= "1.10.0"}
16
+
"odoc" {with-doc}
18
17
"jsont" {with-test & >= "0.2.0"}
19
-
"odoc" {with-doc}
20
18
]
21
19
build: [
22
20
["dune" "subst"] {dev}
···
32
30
"@doc" {with-doc}
33
31
]
34
32
]
35
-
dev-repo: "git+https://github.com/username/html5rw.git"
33
+
x-maintenance-intent: ["(latest)"]
+11
-1
lib/dom/html5rw_dom.ml
+11
-1
lib/dom/html5rw_dom.ml
···
1
-
(* html5rw.dom - HTML5 DOM types and operations *)
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** HTML5 DOM Types and Operations
7
+
8
+
This module provides DOM manipulation functions for HTML5 documents.
9
+
It includes node creation, tree traversal, attribute manipulation,
10
+
and serialization.
11
+
*)
2
12
3
13
include Node
4
14
+5
lib/dom/node.ml
+5
lib/dom/node.ml
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
1
6
(* HTML5 DOM node types *)
2
7
3
8
type doctype_data = {
+5
lib/dom/node.mli
+5
lib/dom/node.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
1
6
(** HTML5 DOM Node Types and Operations
2
7
3
8
This module provides the DOM node representation used by the HTML5 parser.
+5
lib/dom/serialize.ml
+5
lib/dom/serialize.ml
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
1
6
(* HTML5 DOM serialization *)
2
7
3
8
open Bytesrw
+89
-7
lib/encoding/html5rw_encoding.ml
+89
-7
lib/encoding/html5rw_encoding.ml
···
1
-
(* html5rw.encoding - HTML5 encoding detection and decoding *)
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** HTML5 Encoding Detection and Decoding
7
+
8
+
This module implements the WHATWG encoding sniffing and decoding
9
+
algorithms for HTML5 documents. It handles automatic character
10
+
encoding detection from byte order marks (BOM), meta charset
11
+
declarations, and transport layer hints.
12
+
13
+
{2 Encoding Detection Algorithm}
14
+
15
+
The encoding detection follows the WHATWG specification:
16
+
1. Check for a BOM (UTF-8, UTF-16LE, UTF-16BE)
17
+
2. Prescan for [<meta charset>] or [<meta http-equiv="content-type">]
18
+
3. Use transport layer encoding hint if provided
19
+
4. Fall back to UTF-8 as the default
20
+
21
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
22
+
WHATWG encoding sniffing algorithm
23
+
*)
2
24
25
+
(** {1 Types} *)
26
+
27
+
(** Character encodings supported by the parser.
28
+
29
+
The HTML5 specification requires support for a large number of
30
+
encodings, but this implementation focuses on the most common ones.
31
+
Other encodings are mapped to their closest equivalent.
32
+
*)
3
33
type encoding = Encoding.t =
4
-
| Utf8
5
-
| Utf16le
6
-
| Utf16be
7
-
| Windows_1252
8
-
| Iso_8859_2
9
-
| Euc_jp
34
+
| Utf8 (** UTF-8 encoding (default) *)
35
+
| Utf16le (** UTF-16 little-endian *)
36
+
| Utf16be (** UTF-16 big-endian *)
37
+
| Windows_1252 (** Windows-1252 (Latin-1 superset) *)
38
+
| Iso_8859_2 (** ISO-8859-2 (Central European) *)
39
+
| Euc_jp (** EUC-JP (Japanese) *)
40
+
41
+
(** {1 Encoding Utilities} *)
42
+
43
+
(** Convert an encoding to its canonical label string.
10
44
45
+
Returns the WHATWG canonical name, e.g., ["utf-8"], ["utf-16le"].
46
+
*)
11
47
let encoding_to_string = Encoding.to_string
12
48
49
+
(** Detect encoding from a byte order mark.
50
+
51
+
Examines the first bytes of the input for a BOM and returns the
52
+
detected encoding with the number of bytes to skip.
53
+
54
+
@return [(Some (encoding, skip_bytes))] if a BOM is found,
55
+
[None] otherwise.
56
+
*)
13
57
let sniff_bom = Bom.sniff
14
58
59
+
(** Normalize an encoding label to its canonical form.
60
+
61
+
Maps encoding labels (case-insensitive, with optional whitespace)
62
+
to the supported encoding types.
63
+
64
+
@return [Some encoding] if the label is recognized, [None] otherwise.
65
+
66
+
{[
67
+
normalize_label "UTF-8" (* Some Utf8 *)
68
+
normalize_label "utf8" (* Some Utf8 *)
69
+
normalize_label "latin1" (* Some Windows_1252 *)
70
+
]}
71
+
*)
15
72
let normalize_label = Labels.normalize_label
16
73
74
+
(** Prescan bytes to find a meta charset declaration.
75
+
76
+
Implements the WHATWG prescan algorithm that looks for encoding
77
+
declarations in the first 1024 bytes of an HTML document.
78
+
79
+
@return [Some encoding] if a meta charset is found, [None] otherwise.
80
+
*)
17
81
let prescan_for_meta_charset = Prescan.prescan_for_meta_charset
18
82
83
+
(** {1 Decoding} *)
84
+
85
+
(** Decode raw bytes to a UTF-8 string with automatic encoding detection.
86
+
87
+
This function implements the full encoding sniffing algorithm:
88
+
1. Check for BOM
89
+
2. Prescan for meta charset
90
+
3. Use transport encoding hint if provided
91
+
4. Fall back to UTF-8
92
+
93
+
@param transport_encoding Encoding hint from HTTP Content-Type header
94
+
@return [(decoded_string, detected_encoding)]
95
+
96
+
{[
97
+
let (html, enc) = decode raw_bytes ()
98
+
(* html is now a UTF-8 string, enc is the detected encoding *)
99
+
]}
100
+
*)
19
101
let decode = Decode.decode
+90
-1
lib/entities/html5rw_entities.ml
+90
-1
lib/entities/html5rw_entities.ml
···
1
-
(* html5rw.entities - HTML5 entity decoding *)
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** HTML5 Named Character Reference Decoding
7
+
8
+
This module provides functions for decoding HTML5 named character
9
+
references (entities) and numeric character references. It includes
10
+
the complete table of 2,231 named character references defined in
11
+
the WHATWG HTML5 specification.
12
+
13
+
{2 Character Reference Types}
14
+
15
+
HTML5 supports three types of character references:
16
+
17
+
{3 Named References}
18
+
- Standard form: [&amp;], [&lt;], [&gt;], [&nbsp;]
19
+
- Some entities have multiple codepoint outputs: [&NotNestedLessLess;]
20
+
21
+
{3 Decimal Numeric References}
22
+
- Form: [&#123;] (decimal codepoint)
23
+
24
+
{3 Hexadecimal Numeric References}
25
+
- Form: [&#x7B;] or [&#X7B;] (hexadecimal codepoint)
26
+
27
+
{2 Legacy Entity Handling}
28
+
29
+
Some named entities are "legacy" - they were supported without a
30
+
trailing semicolon in older browsers (e.g., [&amp] instead of [&amp;]).
31
+
The parser handles these according to the WHATWG specification.
32
+
33
+
@see <https://html.spec.whatwg.org/multipage/named-characters.html>
34
+
The complete list of named character references
35
+
*)
36
+
37
+
(** {1 Decoding Functions} *)
38
+
39
+
(** Decode all character references in a text string.
40
+
41
+
Processes the string and replaces all valid character references
42
+
(named and numeric) with their decoded UTF-8 equivalents.
43
+
44
+
{[
45
+
decode "Hello &amp; goodbye"
46
+
(* Returns: "Hello & goodbye" *)
2
47
48
+
decode "&lt;script&gt;"
49
+
(* Returns: "<script>" *)
50
+
]}
51
+
*)
3
52
let decode = Decode.decode_entities_in_text
4
53
54
+
(** Decode a numeric character reference.
55
+
56
+
@param codepoint The Unicode codepoint to decode
57
+
@return The UTF-8 string representation
58
+
59
+
Note: Some codepoints are replaced according to the HTML5
60
+
specification (e.g., control characters in the 0x80-0x9F range
61
+
are mapped to Windows-1252 equivalents).
62
+
*)
5
63
let decode_numeric = Numeric_ref.decode
6
64
65
+
(** Look up a named character reference.
66
+
67
+
@param name The entity name without [&] and [;] (e.g., ["amp"])
68
+
@return [Some codepoints] if the entity exists, [None] otherwise
69
+
70
+
{[
71
+
lookup "amp" (* Some [0x26] *)
72
+
lookup "nbsp" (* Some [0xA0] *)
73
+
lookup "bogus" (* None *)
74
+
]}
75
+
*)
7
76
let lookup = Entity_table.lookup
8
77
78
+
(** Check if an entity is a legacy entity.
79
+
80
+
Legacy entities are those that were historically recognized without
81
+
a trailing semicolon. The parser handles these specially to maintain
82
+
browser compatibility.
83
+
84
+
{[
85
+
is_legacy "amp" (* true - &amp works without ; *)
86
+
is_legacy "nbsp" (* true *)
87
+
is_legacy "Aacute" (* false - requires semicolon *)
88
+
]}
89
+
*)
9
90
let is_legacy = Entity_table.is_legacy
10
91
92
+
(** Convert a Unicode codepoint to its UTF-8 encoding.
93
+
94
+
@param codepoint The Unicode codepoint (0 to 0x10FFFF)
95
+
@return The UTF-8 encoded string
96
+
*)
11
97
let codepoint_to_utf8 = Numeric_ref.codepoint_to_utf8
12
98
99
+
(** {1 Sub-modules} *)
100
+
101
+
(** Numeric character reference handling. *)
13
102
module Numeric_ref = Numeric_ref
+5
lib/html5rw/html5rw.ml
+5
lib/html5rw/html5rw.ml
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
1
6
(** Html5rw - Pure OCaml HTML5 Parser
2
7
3
8
This module provides a complete HTML5 parsing solution following the
+5
lib/html5rw/html5rw.mli
+5
lib/html5rw/html5rw.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
1
6
(** Html5rw - Pure OCaml HTML5 Parser
2
7
3
8
This module provides a complete HTML5 parsing solution following the
+5
lib/parser/html5rw_parser.ml
+5
lib/parser/html5rw_parser.ml
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
1
6
(* html5rw.parser - HTML5 parser with bytesrw-only API *)
2
7
3
8
module Dom = Html5rw_dom
+6
-1
lib/parser/html5rw_parser.mli
+6
-1
lib/parser/html5rw_parser.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
1
6
(** HTML5 Parser
2
7
3
8
This module provides the core HTML5 parsing functionality implementing
···
172
177
173
178
@raise Html5rw_selector.Selector_error if the selector is invalid
174
179
175
-
@see {!Html5rw_selector} for supported selector syntax
180
+
See {!Html5rw_selector} for supported selector syntax.
176
181
*)
177
182
178
183
(** {1 Serialization} *)
+5
lib/parser/parser.ml
+5
lib/parser/parser.ml
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
1
6
(* Main parser entry point - bytesrw-only API *)
2
7
3
8
open Bytesrw
+1
-1
lib/selector/dune
+1
-1
lib/selector/dune
+95
-1
lib/selector/html5rw_selector.ml
+95
-1
lib/selector/html5rw_selector.ml
···
1
-
(* html5rw.selector - CSS selector engine *)
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
2
5
6
+
(** CSS Selector Engine
7
+
8
+
This module provides CSS selector parsing and matching for querying
9
+
the HTML5 DOM. It supports a subset of CSS3 selectors suitable for
10
+
common web scraping and DOM manipulation tasks.
11
+
12
+
{2 Supported Selectors}
13
+
14
+
{3 Simple Selectors}
15
+
- Tag: [div], [p], [span]
16
+
- ID: [#myid]
17
+
- Class: [.myclass]
18
+
- Universal: [*]
19
+
20
+
{3 Attribute Selectors}
21
+
- Presence: [[attr]]
22
+
- Exact match: [[attr="value"]]
23
+
- Contains word: [[attr~="value"]]
24
+
- Starts with: [[attr^="value"]]
25
+
- Ends with: [[attr$="value"]]
26
+
- Contains: [[attr*="value"]]
27
+
- Hyphen-separated: [[attr|="value"]]
28
+
29
+
{3 Pseudo-classes}
30
+
- [:first-child], [:last-child]
31
+
- [:nth-child(n)], [:nth-last-child(n)]
32
+
- [:only-child]
33
+
- [:empty]
34
+
- [:not(selector)]
35
+
36
+
{3 Combinators}
37
+
- Descendant: [div p] (p anywhere inside div)
38
+
- Child: [div > p] (p direct child of div)
39
+
- Adjacent sibling: [div + p] (p immediately after div)
40
+
- General sibling: [div ~ p] (p after div, same parent)
41
+
42
+
{2 Usage}
43
+
44
+
{[
45
+
let doc = Html5rw.parse reader in
46
+
47
+
(* Find all paragraphs *)
48
+
let paragraphs = Html5rw.query doc "p" in
49
+
50
+
(* Find links with specific class *)
51
+
let links = Html5rw.query doc "a.external" in
52
+
53
+
(* Find table cells in rows *)
54
+
let cells = Html5rw.query doc "tr > td" in
55
+
56
+
(* Check if a node matches *)
57
+
let is_active = Html5rw.matches node ".active"
58
+
]}
59
+
*)
60
+
61
+
(** {1 Exceptions} *)
62
+
63
+
(** Raised when a selector string is malformed.
64
+
65
+
The exception contains an error message describing the parse error.
66
+
*)
3
67
exception Selector_error = Selector_lexer.Selector_error
4
68
69
+
(** {1 Sub-modules} *)
70
+
71
+
(** Abstract syntax tree for parsed selectors. *)
5
72
module Ast = Selector_ast
73
+
74
+
(** Token types for the selector lexer. *)
6
75
module Token = Selector_token
7
76
77
+
(** {1 Functions} *)
78
+
79
+
(** Parse a CSS selector string.
80
+
81
+
@raise Selector_error if the selector is malformed.
82
+
*)
8
83
let parse = Selector_parser.parse_selector
9
84
85
+
(** Query the DOM tree with a CSS selector.
86
+
87
+
Returns all nodes matching the selector in document order.
88
+
89
+
@raise Selector_error if the selector is malformed.
90
+
91
+
{[
92
+
let divs = query root_node "div.content > p"
93
+
]}
94
+
*)
10
95
let query = Selector_match.query
11
96
97
+
(** Check if a node matches a CSS selector.
98
+
99
+
@raise Selector_error if the selector is malformed.
100
+
101
+
{[
102
+
if matches node ".active" then
103
+
(* node has class "active" *)
104
+
]}
105
+
*)
12
106
let matches = Selector_match.matches
+15
-1
lib/selector/selector_match.ml
+15
-1
lib/selector/selector_match.ml
···
3
3
module Dom = Html5rw_dom
4
4
open Selector_ast
5
5
6
+
(* [string_contains ~haystack ~needle] is [true] iff [needle] occurs as a
   contiguous substring of [haystack].

   An empty [needle] is contained in every string (including the empty one);
   a [needle] longer than [haystack] is never contained.

   Bytes are compared in place, so no intermediate substring is allocated
   for each candidate offset. *)
let string_contains ~haystack ~needle =
  let needle_len = String.length needle in
  (* Greatest offset at which [needle] could still fit; negative when
     [needle] is longer than [haystack], which makes [scan] fail at once. *)
  let last_start = String.length haystack - needle_len in
  (* [matches_at i j]: bytes [j .. needle_len - 1] of [needle] equal the
     bytes of [haystack] starting at offset [i + j]. *)
  let rec matches_at i j =
    j >= needle_len
    || (Char.equal haystack.[i + j] needle.[j] && matches_at i (j + 1))
  in
  (* Try each candidate offset from left to right; both recursive calls sit
     in tail position of [&&] / [||], so the search uses constant stack. *)
  let rec scan i = i <= last_start && (matches_at i 0 || scan (i + 1)) in
  scan 0
19
+
6
20
let is_element node =
7
21
let name = node.Dom.name in
8
22
name <> "#text" && name <> "#comment" && name <> "#document" &&
···
177
191
String.sub attr_value 0 (String.length value) = value
178
192
| Some "$=" -> value <> "" && String.length attr_value >= String.length value &&
179
193
String.sub attr_value (String.length attr_value - String.length value) (String.length value) = value
180
-
| Some "*=" -> value <> "" && Re.execp (Re.compile (Re.str value)) attr_value
194
+
| Some "*=" -> value <> "" && string_contains ~haystack:attr_value ~needle:value
181
195
| Some _ | None -> false))
182
196
| None -> false)
183
197
| Type_pseudo ->
+3
test/dune
+3
test/dune
···
128
128
(executable (name template_debug3) (libraries bytesrw html5rw.parser html5rw.dom))
129
129
(executable (name template_debug4) (libraries bytesrw html5rw.parser html5rw.dom))
130
130
(executable (name ns_sens_test) (libraries bytesrw html5rw.parser html5rw.dom))
131
+
(executable (name debug2) (libraries bytesrw html5rw.parser html5rw.dom))
132
+
(executable (name debug_title) (libraries bytesrw html5rw.parser html5rw.dom))
133
+
(executable (name nobr_debug2) (libraries bytesrw html5rw.parser html5rw.dom))
+18
-12
test/nobr_debug2.ml
+18
-12
test/nobr_debug2.ml
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(* Test nobr element handling in SVG fragment context *)
7
+
8
+
open Bytesrw
9
+
1
10
module Parser = Html5rw_parser
2
11
module Dom = Html5rw_dom
3
12
4
13
let rec print_tree indent node =
5
-
Printf.printf "%s%s (ns=%s, %d children)\n"
6
-
indent
7
-
node.Dom.name
14
+
Printf.printf "%s%s (ns=%s, %d children)\n"
15
+
indent
16
+
node.Dom.name
8
17
(match node.Dom.namespace with Some s -> s | None -> "html")
9
18
(List.length node.Dom.children);
10
19
List.iter (print_tree (indent ^ " ")) node.Dom.children
···
12
21
let () =
13
22
let input = "<nobr>X" in
14
23
print_endline "Starting...";
15
-
let context = { Parser.Tree_builder.tag_name = "path"; namespace = Some "svg" } in
16
-
17
-
(* Create parser state directly for inspection *)
18
-
let t = Parser.Tree_builder.create ~collect_errors:true ~fragment_context:context input in
19
-
print_endline "\nInitial tree structure:";
20
-
print_tree "" t.Parser.Tree_builder.document;
21
-
print_endline "\nInitial stack size:";
22
-
Printf.printf "%d elements\n" (List.length t.Parser.Tree_builder.open_elements);
23
-
List.iter (fun n -> Printf.printf " - %s\n" n.Dom.name) t.Parser.Tree_builder.open_elements
24
+
let context = Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") () in
25
+
let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
26
+
print_endline "\nFinal tree structure:";
27
+
print_tree "" (Parser.root result);
28
+
print_endline "\nTest format:";
29
+
print_endline (Dom.to_test_format (Parser.root result))