OCaml HTML5 parser/serialiser based on Python's JustHTML
1open Bytesrw
2
3(* Practical web scraping example *)
4
(* Fixed HTML document used as input by this example: a news page with a
   navigation bar, three article elements (the first one also carries the
   featured class) and a list of tag links. *)
let sample_page = {html|
<!DOCTYPE html>
<html lang="en">
<head>
 <meta charset="UTF-8">
 <title>Tech News - Latest Stories</title>
</head>
<body>
 <header>
 <nav>
 <a href="/">Home</a>
 <a href="/news">News</a>
 <a href="/about">About</a>
 </nav>
 </header>

 <main>
 <article class="story featured">
 <h2><a href="/story/1">Revolutionary AI Breakthrough</a></h2>
 <p class="summary">Scientists announce major advancement in machine learning...</p>
 <span class="author">By Jane Smith</span>
 <time datetime="2024-01-15">January 15, 2024</time>
 </article>

 <article class="story">
 <h2><a href="/story/2">New Programming Language Released</a></h2>
 <p class="summary">The language promises 10x developer productivity...</p>
 <span class="author">By John Doe</span>
 <time datetime="2024-01-14">January 14, 2024</time>
 </article>

 <article class="story">
 <h2><a href="/story/3">Open Source Project Reaches Milestone</a></h2>
 <p class="summary">Community celebrates 1 million downloads...</p>
 <span class="author">By Alice Chen</span>
 <time datetime="2024-01-13">January 13, 2024</time>
 </article>
 </main>

 <aside>
 <h3>Popular Tags</h3>
 <ul class="tags">
 <li><a href="/tag/ai">AI</a></li>
 <li><a href="/tag/programming">Programming</a></li>
 <li><a href="/tag/opensource">Open Source</a></li>
 </ul>
 </aside>
</body>
</html>
|html}
55
(* One news story as extracted from an article element by [extract_story]. *)
type story = {
  title : string;  (* headline text *)
  url : string;  (* href of the headline link *)
  summary : string;  (* text of the element with class summary *)
  author : string;  (* text of the element with class author *)
  date : string;  (* datetime attribute of the time element *)
  featured : bool;  (* whether the article matches the .featured selector *)
}
64
(* [find_child_by_tag parent tag] returns the first direct child of [parent]
   that is an element and whose name, once lowercased with ASCII rules,
   equals [tag]. [tag] itself is compared as given, so callers should pass
   it in lowercase. Returns [None] when no child qualifies. *)
let find_child_by_tag parent tag =
  let is_wanted candidate =
    Html5rw.is_element candidate
    && String.lowercase_ascii candidate.Html5rw.Dom.name = tag
  in
  List.find_opt is_wanted parent.Html5rw.Dom.children
70
(* [find_descendant_by_tag node tag] looks for an element named [tag] below
   [node]. The element children of [node] are checked first; only when none
   of them matches is each element child searched recursively, in order.
   A matching direct child is therefore returned in preference to a deeper
   match that appears earlier in the document. Element names are lowercased
   with ASCII rules before the comparison; [tag] is compared as given.
   Returns [None] when nothing matches. *)
let rec find_descendant_by_tag node tag =
  let element_children =
    List.filter Html5rw.is_element node.Html5rw.Dom.children
  in
  let is_wanted el = String.lowercase_ascii el.Html5rw.Dom.name = tag in
  match List.find_opt is_wanted element_children with
  | Some _ as hit -> hit
  | None ->
    List.find_map
      (fun el -> find_descendant_by_tag el tag)
      element_children
78
(* [find_by_class node cls] returns an element below [node] whose class
   attribute contains the token [cls], or [None] when there is none.

   Search order: the element children of [node] are checked first; only when
   none of them matches is each element child searched recursively, in order.

   The class attribute is split on every ASCII whitespace character (space,
   tab, line feed, form feed, carriage return), as HTML defines class lists,
   so values such as story<TAB>featured or classes spread over several lines
   are recognised, not only space-separated ones. *)
let find_by_class node cls =
  let is_ascii_whitespace = function
    | ' ' | '\t' | '\n' | '\012' | '\r' -> true
    | _ -> false
  in
  (* Whether the class attribute of [n] lists [cls] as one of its tokens. *)
  let has_class n =
    match Html5rw.get_attr n "class" with
    | None -> false
    | Some classes ->
      (* Normalise every whitespace separator to a plain space so that a
         single split handles all of them. *)
      classes
      |> String.map (fun c -> if is_ascii_whitespace c then ' ' else c)
      |> String.split_on_char ' '
      |> List.mem cls
  in
  let rec search current =
    let element_children =
      List.filter Html5rw.is_element current.Html5rw.Dom.children
    in
    match List.find_opt has_class element_children with
    | Some _ as hit -> hit
    | None -> List.find_map search element_children
  in
  search node
91
(* [extract_story article] builds a [story] record from an article element.

   - title / url: text and href of the a element that is a direct child of
     the h2 found by [find_descendant_by_tag]. Without such a link the h2
     text is used with url "#"; without any h2 the title is "(no title)"
     and the url "#". A link lacking href also yields "#".
   - summary: text of the element with class summary, or "" if absent.
   - author: text of the element with class author, or "Unknown" if absent.
   - date: datetime attribute of the time element, or "" if either the
     element or the attribute is missing.
   - featured: result of [Html5rw.matches] with the selector .featured. *)
let extract_story article =
  let text_of node = Html5rw.get_text_content node in
  (* Text of the element carrying class [cls], or [default] if none. *)
  let text_of_class cls ~default =
    match find_by_class article cls with
    | None -> default
    | Some node -> text_of node
  in
  let title, url =
    match find_descendant_by_tag article "h2" with
    | None -> ("(no title)", "#")
    | Some heading -> (
      match find_child_by_tag heading "a" with
      | None -> (text_of heading, "#")
      | Some link ->
        let href = Option.value ~default:"#" (Html5rw.get_attr link "href") in
        (text_of link, href))
  in
  let date =
    Option.bind
      (find_descendant_by_tag article "time")
      (fun time_el -> Html5rw.get_attr time_el "datetime")
    |> Option.value ~default:""
  in
  {
    title;
    url;
    summary = text_of_class "summary" ~default:"";
    author = text_of_class "author" ~default:"Unknown";
    date;
    featured = Html5rw.matches article ".featured";
  }
121
(* Entry point: parses [sample_page] and prints, in this order, the page
   title, the navigation links, every story and the tag links to stdout. *)
let () =
  (* href attribute of [link], or [default] when the attribute is absent. *)
  let href_of ~default link =
    Option.value ~default (Html5rw.get_attr link "href")
  in
  (* Prints every link through [fmt], which receives the link text first
     and its href (or "#" when missing) second. *)
  let print_links fmt links =
    List.iter
      (fun link ->
        let label = Html5rw.get_text_content link in
        let target = href_of ~default:"#" link in
        Printf.printf fmt label target)
      links
  in
  (* A link belongs to the navigation when one of its ancestors is a nav. *)
  let inside_nav link =
    Html5rw.ancestors link
    |> List.exists (fun ancestor -> ancestor.Html5rw.Dom.name = "nav")
  in
  (* Tag links are those whose href starts with /tag/ and has at least one
     more character after that prefix. *)
  let is_tag_link link =
    let target = href_of ~default:"" link in
    String.length target > 5 && String.sub target 0 5 = "/tag/"
  in
  let print_story article =
    let { title; url; summary; author; date; featured } =
      extract_story article
    in
    Printf.printf "\n %s%s\n" (if featured then "[FEATURED] " else "") title;
    Printf.printf " URL: %s\n" url;
    Printf.printf " Summary: %s\n" summary;
    Printf.printf " %s | %s\n" author date
  in

  Printf.printf "=== Web Scraping Example ===\n\n";
  let document = Html5rw.parse (Bytes.Reader.of_string sample_page) in

  (* Page title: text of the first title element, if there is one. *)
  (match Html5rw.query document "title" with
   | [] -> ()
   | first :: _ ->
     Printf.printf "Page title: %s\n\n" (Html5rw.get_text_content first));

  (* Navigation: all links, narrowed down to those located under a nav. *)
  Printf.printf "Navigation:\n";
  Html5rw.query document "a"
  |> List.filter inside_nav
  |> print_links " %s -> %s\n";

  (* Stories: one record per article element. *)
  Printf.printf "\nStories:\n";
  Html5rw.query document "article" |> List.iter print_story;

  (* Tags: all links, narrowed down by their href prefix. *)
  Printf.printf "\nPopular Tags:\n";
  Html5rw.query document "a"
  |> List.filter is_tag_link
  |> print_links " #%s (%s)\n"