OCaml HTML5 parser/serialiser based on Python's JustHTML
1open Bytesrw
2
3(* CSS selector query example *)
4
(* Sample document queried by the examples below: a product list with an
   [h1#title] heading and three [li.product] items (data-id 1 to 3), each
   holding a [span.name] and a [span.price]; the third item also carries
   the [featured] class. *)
5let html = {|
6<!DOCTYPE html>
7<html>
8<head><title>Products</title></head>
9<body>
10 <div class="container">
11 <h1 id="title">Product List</h1>
12 <ul class="products">
13 <li class="product" data-id="1">
14 <span class="name">Widget A</span>
15 <span class="price">$10.00</span>
16 </li>
17 <li class="product" data-id="2">
18 <span class="name">Widget B</span>
19 <span class="price">$15.00</span>
20 </li>
21 <li class="product featured" data-id="3">
22 <span class="name">Widget C</span>
23 <span class="price">$20.00</span>
24 </li>
25 </ul>
26 </div>
27</body>
28</html>
29|}
30
(* [parse_price text] reads a price written as a dollar sign followed by a
   decimal number, e.g. "$20.00". Surrounding whitespace is ignored.
   Returns [None] when the text is empty, does not start with a dollar sign,
   or the remainder is not a valid float, so malformed markup is skipped
   instead of raising [Invalid_argument] (from [String.sub]) or [Failure]
   (from [float_of_string]). *)
let parse_price text =
  let text = String.trim text in
  let len = String.length text in
  if len > 1 && Char.equal text.[0] '$' then
    float_of_string_opt (String.sub text 1 (len - 1))
  else None

(* [descendants_matching selector node] lists, in the order returned by
   [Html5rw.descendants], the descendants of [node] that match [selector]. *)
let descendants_matching selector node =
  Html5rw.descendants node
  |> List.filter (fun candidate -> Html5rw.matches candidate selector)

(* Entry point: parses the sample [html] document and prints the result of
   one query per supported selector form (id, class, tag, attribute,
   compound class, pseudo-class, universal), then combines queries to list
   the products whose price is strictly above 15. *)
let () =
  let result = Html5rw.parse (Bytes.Reader.of_string html) in

  (* Find element by ID *)
  Printf.printf "=== ID Selector (#title) ===\n";
  let titles = Html5rw.query result "#title" in
  List.iter
    (fun node -> Printf.printf "Found: %s\n" (Html5rw.get_text_content node))
    titles;

  (* Find elements by class *)
  Printf.printf "\n=== Class Selector (.product) ===\n";
  let products = Html5rw.query result ".product" in
  Printf.printf "Found %d products\n" (List.length products);

  (* Find elements by tag *)
  Printf.printf "\n=== Tag Selector (span) ===\n";
  let spans = Html5rw.query result "span" in
  Printf.printf "Found %d span elements\n" (List.length spans);

  (* Find with attribute presence *)
  Printf.printf "\n=== Attribute Presence ([data-id]) ===\n";
  let with_data_id = Html5rw.query result "[data-id]" in
  List.iter
    (fun node ->
      match Html5rw.get_attr node "data-id" with
      | Some id -> Printf.printf "Found element with data-id=%s\n" id
      | None -> ())
    with_data_id;

  (* Find with attribute value *)
  Printf.printf "\n=== Attribute Value ([data-id=\"3\"]) ===\n";
  let featured = Html5rw.query result "[data-id=\"3\"]" in
  List.iter
    (fun node -> Printf.printf "Found: %s\n" (Html5rw.get_text_content node))
    featured;

  (* Find with multiple classes: the query must be the compound selector
     announced in the heading, not just the second class on its own. *)
  Printf.printf "\n=== Multiple Classes (.product.featured) ===\n";
  let featured_products = Html5rw.query result ".product.featured" in
  List.iter
    (fun node ->
      Printf.printf "Featured: %s\n" (Html5rw.get_text_content node))
    featured_products;

  (* Check if a node matches a selector *)
  Printf.printf "\n=== Match Check (.featured) ===\n";
  List.iter
    (fun node ->
      if Html5rw.matches node ".featured" then
        Printf.printf "This product is featured!\n")
    products;

  (* Pseudo-class: first-child *)
  Printf.printf "\n=== Pseudo-class (:first-child) ===\n";
  let first = Html5rw.query result "li:first-child" in
  List.iter
    (fun node ->
      Printf.printf "First li: %s\n"
        (String.trim (Html5rw.get_text_content node)))
    first;

  (* Pseudo-class: last-child *)
  Printf.printf "\n=== Pseudo-class (:last-child) ===\n";
  let last = Html5rw.query result "li:last-child" in
  List.iter
    (fun node ->
      Printf.printf "Last li: %s\n"
        (String.trim (Html5rw.get_text_content node)))
    last;

  (* Universal selector *)
  Printf.printf "\n=== Universal Selector (*) ===\n";
  let all = Html5rw.query result "*" in
  Printf.printf "Total elements: %d\n" (List.length all);

  (* Combining queries: find products then filter *)
  Printf.printf "\n=== Combined: Products with price > $15 ===\n";
  List.iter
    (fun product ->
      (* Find the price span(s) within this product *)
      descendants_matching ".price" product
      |> List.iter (fun price_span ->
             let price_text = Html5rw.get_text_content price_span in
             match parse_price price_text with
             | None -> () (* not a dollar-prefixed number: skip it *)
             | Some price ->
                 if price > 15.0 then (
                   (* Label the price with the first name span, if any *)
                   match descendants_matching ".name" product with
                   | name :: _ ->
                       Printf.printf " %s: %s\n"
                         (Html5rw.get_text_content name)
                         price_text
                   | [] -> ())))
    products