OCaml HTML5 parser/serialiser based on Python's JustHTML
open Bytesrw

(* Practical web scraping example.

   Parses an in-memory HTML page with Html5rw and prints the page title, the
   navigation links, the news stories and the "popular tags" list. *)

(** The HTML document scraped by this example. *)
let sample_page = {|
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>Tech News - Latest Stories</title>
</head>
<body>
  <header>
    <nav>
      <a href="/">Home</a>
      <a href="/news">News</a>
      <a href="/about">About</a>
    </nav>
  </header>

  <main>
    <article class="story featured">
      <h2><a href="/story/1">Revolutionary AI Breakthrough</a></h2>
      <p class="summary">Scientists announce major advancement in machine learning...</p>
      <span class="author">By Jane Smith</span>
      <time datetime="2024-01-15">January 15, 2024</time>
    </article>

    <article class="story">
      <h2><a href="/story/2">New Programming Language Released</a></h2>
      <p class="summary">The language promises 10x developer productivity...</p>
      <span class="author">By John Doe</span>
      <time datetime="2024-01-14">January 14, 2024</time>
    </article>

    <article class="story">
      <h2><a href="/story/3">Open Source Project Reaches Milestone</a></h2>
      <p class="summary">Community celebrates 1 million downloads...</p>
      <span class="author">By Alice Chen</span>
      <time datetime="2024-01-13">January 13, 2024</time>
    </article>
  </main>

  <aside>
    <h3>Popular Tags</h3>
    <ul class="tags">
      <li><a href="/tag/ai">AI</a></li>
      <li><a href="/tag/programming">Programming</a></li>
      <li><a href="/tag/opensource">Open Source</a></li>
    </ul>
  </aside>
</body>
</html>
|}

(** One news story as extracted from an [<article>] element. *)
type story = {
  title: string;     (** Text of the [<h2>] link (or of the [<h2>] itself). *)
  url: string;       (** [href] of the [<h2>] link, ["#"] when absent. *)
  summary: string;   (** Text of the element with class [summary], or [""]. *)
  author: string;    (** Text of the element with class [author], or ["Unknown"]. *)
  date: string;      (** [datetime] attribute of the [<time>] element, or [""]. *)
  featured: bool;    (** Whether the article matches the selector [.featured]. *)
}

(* [name_is tag node] compares the node's name with [tag], ignoring ASCII
   case on both sides (HTML element names are case-insensitive). *)
let name_is tag node =
  String.equal
    (String.lowercase_ascii node.Html5rw.Dom.name)
    (String.lowercase_ascii tag)

(* ASCII whitespace as used by HTML to separate tokens in attribute values:
   space, tab, line feed, form feed and carriage return. *)
let is_ascii_whitespace = function
  | ' ' | '\t' | '\n' | '\012' | '\r' -> true
  | _ -> false

(* [class_tokens value] splits a [class] attribute value into its non-empty
   tokens. Any ASCII whitespace separates tokens, not only the space
   character, so multi-line or tab-separated class lists are handled. *)
let class_tokens value =
  value
  |> String.map (fun c -> if is_ascii_whitespace c then ' ' else c)
  |> String.split_on_char ' '
  |> List.filter (fun token -> token <> "")

(* [has_class cls node] is [true] when [cls] is one of the tokens of the
   node's [class] attribute. An empty [cls] never matches. *)
let has_class cls node =
  match Html5rw.get_attr node "class" with
  | Some value -> List.mem cls (class_tokens value)
  | None -> false

(* [find_descendant pred node] returns an element below [node] satisfying
   [pred]. The element children of [node] are tested first; only when none
   of them matches is each child's subtree searched, in order. The result is
   therefore the shallowest match along the first matching branch, which is
   not necessarily the first match in document order. *)
let rec find_descendant pred node =
  let children = List.filter Html5rw.is_element node.Html5rw.Dom.children in
  match List.find_opt pred children with
  | Some _ as found -> found
  | None -> List.find_map (find_descendant pred) children

(** [find_child_by_tag parent tag] is the first direct element child of
    [parent] named [tag] (case-insensitive), or [None]. *)
let find_child_by_tag parent tag =
  List.find_opt
    (fun n -> Html5rw.is_element n && name_is tag n)
    parent.Html5rw.Dom.children

(** [find_descendant_by_tag node tag] is an element below [node] named [tag]
    (case-insensitive), or [None]. Direct children are preferred over deeper
    matches. *)
let find_descendant_by_tag node tag = find_descendant (name_is tag) node

(** [find_by_class node cls] is an element below [node] whose [class]
    attribute contains the token [cls], or [None]. Direct children are
    preferred over deeper matches. *)
let find_by_class node cls = find_descendant (has_class cls) node

(** [extract_story article] builds a {!story} from an [<article>] element,
    substituting placeholder values for any missing piece. *)
let extract_story article =
  (* Title and URL come from the h2 > a link; fall back to the bare heading
     text when the heading holds no link. *)
  let title, url =
    match find_descendant_by_tag article "h2" with
    | None -> ("(no title)", "#")
    | Some h2 ->
      (match find_child_by_tag h2 "a" with
       | Some a ->
         (Html5rw.get_text_content a,
          Option.value ~default:"#" (Html5rw.get_attr a "href"))
       | None -> (Html5rw.get_text_content h2, "#"))
  in
  (* Text of the first element carrying class [cls], or [default]. *)
  let text_of_class cls ~default =
    match find_by_class article cls with
    | Some node -> Html5rw.get_text_content node
    | None -> default
  in
  let summary = text_of_class "summary" ~default:"" in
  let author = text_of_class "author" ~default:"Unknown" in
  let date =
    match find_descendant_by_tag article "time" with
    | Some t -> Option.value ~default:"" (Html5rw.get_attr t "datetime")
    | None -> ""
  in
  let featured = Html5rw.matches article ".featured" in
  { title; url; summary; author; date; featured }

(* [has_proper_prefix ~prefix s] is [true] when [s] starts with [prefix] and
   is strictly longer than it. *)
let has_proper_prefix ~prefix s =
  let plen = String.length prefix in
  String.length s > plen && String.equal (String.sub s 0 plen) prefix

let () =
  Printf.printf "=== Web Scraping Example ===\n\n";

  let result = Html5rw.parse (Bytes.Reader.of_string sample_page) in

  (* Extract page title *)
  (match Html5rw.query result "title" with
   | t :: _ -> Printf.printf "Page title: %s\n\n" (Html5rw.get_text_content t)
   | [] -> ());

  (* Every link of the page, queried once and shared by the navigation and
     tag sections below. *)
  let links = Html5rw.query result "a" in
  let href_or_hash a = Option.value ~default:"#" (Html5rw.get_attr a "href") in

  (* Extract navigation links: keep the links that have a <nav> ancestor *)
  Printf.printf "Navigation:\n";
  links
  |> List.filter (fun a -> List.exists (name_is "nav") (Html5rw.ancestors a))
  |> List.iter (fun a ->
      Printf.printf "  %s -> %s\n" (Html5rw.get_text_content a) (href_or_hash a));

  (* Extract stories *)
  Printf.printf "\nStories:\n";
  Html5rw.query result "article"
  |> List.iter (fun article ->
      let story = extract_story article in
      Printf.printf "\n  %s%s\n"
        (if story.featured then "[FEATURED] " else "")
        story.title;
      Printf.printf "  URL: %s\n" story.url;
      Printf.printf "  Summary: %s\n" story.summary;
      Printf.printf "  %s | %s\n" story.author story.date);

  (* Extract tags: links whose href is "/tag/" followed by a tag name *)
  Printf.printf "\nPopular Tags:\n";
  let is_tag_link a =
    match Html5rw.get_attr a "href" with
    | Some href -> has_proper_prefix ~prefix:"/tag/" href
    | None -> false
  in
  links
  |> List.filter is_tag_link
  |> List.iter (fun a ->
      Printf.printf "  #%s (%s)\n" (Html5rw.get_text_content a) (href_or_hash a))