examples/web_scraper.ml at 81c4816404ceafd6d88e08303e3870f364dc0a32 · anil.recoil.org/ocaml-html5rw

OCaml HTML5 parser/serialiser based on Python's JustHTML
ocaml-html5rw / examples / web_scraper.ml
at 81c4816404ceafd6d88e08303e3870f364dc0a32 5.5 kB view raw
  1open Bytesrw
  2
  3(* Practical web scraping example *)
  4
(* Inline HTML fixture used as the page to scrape.  It contains a <nav> with
   three links, a <main> holding three <article class="story"> elements (the
   first one also carries the featured class), and an <aside> with a list of
   links whose href values start with /tag/. *)
let sample_page = {|
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>Tech News - Latest Stories</title>
</head>
<body>
  <header>
    <nav>
      <a href="/">Home</a>
      <a href="/news">News</a>
      <a href="/about">About</a>
    </nav>
  </header>

  <main>
    <article class="story featured">
      <h2><a href="/story/1">Revolutionary AI Breakthrough</a></h2>
      <p class="summary">Scientists announce major advancement in machine learning...</p>
      <span class="author">By Jane Smith</span>
      <time datetime="2024-01-15">January 15, 2024</time>
    </article>

    <article class="story">
      <h2><a href="/story/2">New Programming Language Released</a></h2>
      <p class="summary">The language promises 10x developer productivity...</p>
      <span class="author">By John Doe</span>
      <time datetime="2024-01-14">January 14, 2024</time>
    </article>

    <article class="story">
      <h2><a href="/story/3">Open Source Project Reaches Milestone</a></h2>
      <p class="summary">Community celebrates 1 million downloads...</p>
      <span class="author">By Alice Chen</span>
      <time datetime="2024-01-13">January 13, 2024</time>
    </article>
  </main>

  <aside>
    <h3>Popular Tags</h3>
    <ul class="tags">
      <li><a href="/tag/ai">AI</a></li>
      <li><a href="/tag/programming">Programming</a></li>
      <li><a href="/tag/opensource">Open Source</a></li>
    </ul>
  </aside>
</body>
</html>
|}
 55
(* One news story as assembled by extract_story from a single <article>
   element.  The fallback values noted below are the ones extract_story
   substitutes when the corresponding markup is missing. *)
type story = {
  title: string;    (* text of the link inside the h2, or of the h2 itself; "(no title)" if there is no h2 *)
  url: string;      (* href of the link inside the h2; "#" when absent *)
  summary: string;  (* text of the descendant with class summary; "" when absent *)
  author: string;   (* text of the descendant with class author; "Unknown" when absent *)
  date: string;     (* datetime attribute of the time element; "" when absent *)
  featured: bool;   (* whether the article matches the .featured selector *)
}
 64
 65(* Helper to find first child element with given tag name *)
 66let find_child_by_tag parent tag =
 67  List.find_opt (fun n ->
 68    Html5rw.is_element n && String.lowercase_ascii n.Html5rw.Dom.name = tag
 69  ) parent.Html5rw.Dom.children
 70
 71(* Helper to find first descendant element with given tag name *)
 72let rec find_descendant_by_tag node tag =
 73  let children = List.filter Html5rw.is_element node.Html5rw.Dom.children in
 74  match List.find_opt (fun n -> String.lowercase_ascii n.Html5rw.Dom.name = tag) children with
 75  | Some found -> Some found
 76  | None ->
 77    List.find_map (fun child -> find_descendant_by_tag child tag) children
 78
 79(* Helper to find first descendant with given class *)
 80let rec find_by_class node cls =
 81  let children = List.filter Html5rw.is_element node.Html5rw.Dom.children in
 82  let has_class n =
 83    match Html5rw.get_attr n "class" with
 84    | Some classes -> List.mem cls (String.split_on_char ' ' classes)
 85    | None -> false
 86  in
 87  match List.find_opt has_class children with
 88  | Some found -> Some found
 89  | None ->
 90    List.find_map (fun child -> find_by_class child cls) children
 91
 92let extract_story article =
 93  (* Find h2 > a for title and URL *)
 94  let title, url =
 95    match find_descendant_by_tag article "h2" with
 96    | Some h2 ->
 97      (match find_child_by_tag h2 "a" with
 98       | Some a ->
 99         (Html5rw.get_text_content a,
100          Option.value ~default:"#" (Html5rw.get_attr a "href"))
101       | None -> (Html5rw.get_text_content h2, "#"))
102    | None -> ("(no title)", "#")
103  in
104  let summary =
105    match find_by_class article "summary" with
106    | Some p -> Html5rw.get_text_content p
107    | None -> ""
108  in
109  let author =
110    match find_by_class article "author" with
111    | Some s -> Html5rw.get_text_content s
112    | None -> "Unknown"
113  in
114  let date =
115    match find_descendant_by_tag article "time" with
116    | Some t -> Option.value ~default:"" (Html5rw.get_attr t "datetime")
117    | None -> ""
118  in
119  let featured = Html5rw.matches article ".featured" in
120  { title; url; summary; author; date; featured }
121
(* Program entry point: parses [sample_page] and prints, in order, the page
   title, the links that sit inside a nav element, every article as a story,
   and the links whose href starts with /tag/. *)
let () =
  (* href of a link, or [default] when the attribute is missing. *)
  let href_or ~default link =
    Option.value ~default (Html5rw.get_attr link "href")
  in
  (* True when one of the ancestors of [link] is named nav. *)
  let inside_nav link =
    Html5rw.ancestors link
    |> List.exists (fun anc -> anc.Html5rw.Dom.name = "nav")
  in
  (* True when the href starts with /tag/ and has at least one more
     character after that prefix. *)
  let is_tag_link link =
    let target = href_or ~default:"" link in
    String.length target > 5 && String.sub target 0 5 = "/tag/"
  in
  let print_story article =
    let { title; url; summary; author; date; featured } =
      extract_story article
    in
    let badge = if featured then "[FEATURED] " else "" in
    Printf.printf "\n  %s%s\n" badge title;
    Printf.printf "  URL: %s\n" url;
    Printf.printf "  Summary: %s\n" summary;
    Printf.printf "  %s | %s\n" author date
  in

  Printf.printf "=== Web Scraping Example ===\n\n";

  let doc = Html5rw.parse (Bytes.Reader.of_string sample_page) in

  (* Page title: only the first title element is reported. *)
  (match Html5rw.query doc "title" with
   | [] -> ()
   | first :: _ ->
     Printf.printf "Page title: %s\n\n" (Html5rw.get_text_content first));

  (* Navigation links: every link, narrowed to those under a nav element. *)
  Printf.printf "Navigation:\n";
  Html5rw.query doc "a"
  |> List.filter inside_nav
  |> List.iter (fun link ->
      let label = Html5rw.get_text_content link in
      let target = href_or ~default:"#" link in
      Printf.printf "  %s -> %s\n" label target);

  (* Stories: one record per article element. *)
  Printf.printf "\nStories:\n";
  Html5rw.query doc "article" |> List.iter print_story;

  (* Tags: every link, narrowed by href prefix. *)
  Printf.printf "\nPopular Tags:\n";
  Html5rw.query doc "a"
  |> List.filter is_tag_link
  |> List.iter (fun link ->
      let label = Html5rw.get_text_content link in
      let target = href_or ~default:"#" link in
      Printf.printf "  #%s (%s)\n" label target)