Tech News - Latest Stories

open Bytesrw

(* Practical web scraping example *)

(* Sample document handed to the parser by the entry point below.
   NOTE(review): as it stands this literal holds only text and no HTML tags,
   so the "title", "a" and "article" queries presumably match nothing --
   confirm whether the markup was lost and should be restored. *)
let sample_page = {| Tech News - Latest Stories

Revolutionary AI Breakthrough

Scientists announce major advancement in machine learning...

By Jane Smith January 15, 2024

New Programming Language Released

The language promises 10x developer productivity...

By John Doe January 14, 2024

Open Source Project Reaches Milestone

Community celebrates 1 million downloads...

By Alice Chen January 13, 2024

|}

(** One news story, as built by [extract_story]. *)
type story = {
  title : string;
      (** Text of the link inside the first [h2], else the text of the [h2]
          itself, else ["(no title)"]. *)
  url : string;  (** [href] of that link; ["#"] when it is missing. *)
  summary : string;
      (** Text of the first descendant with class [summary]; [""] if none. *)
  author : string;
      (** Text of the first descendant with class [author]; ["Unknown"] if
          none. *)
  date : string;
      (** [datetime] attribute of the first [time] descendant; [""] if the
          element or the attribute is missing. *)
  featured : bool;
      (** Whether the article matches the [.featured] selector. *)
}

(* Element children of [node]; non-element nodes are dropped. *)
let element_children node =
  List.filter Html5rw.is_element node.Html5rw.Dom.children

(* [has_tag tag n] compares the lower-cased name of [n] with [tag], so [tag]
   is expected to be given in lower case. *)
let has_tag tag n = String.equal (String.lowercase_ascii n.Html5rw.Dom.name) tag

(* Characters that separate tokens in an HTML class attribute:
   space, tab, line feed, form feed (decimal 12) and carriage return. *)
let is_html_space = function
  | ' ' | '\t' | '\n' | '\012' | '\r' -> true
  | _ -> false

(* [has_class cls n] is true when [cls] is one of the whitespace-separated
   tokens of the class attribute of [n]. Every whitespace character is first
   normalised to a plain space so that tab- or newline-separated class lists
   are recognised too. *)
let has_class cls n =
  match Html5rw.get_attr n "class" with
  | None -> false
  | Some classes ->
      classes
      |> String.map (fun c -> if is_html_space c then ' ' else c)
      |> String.split_on_char ' '
      |> List.mem cls

(* [find_descendant pred node] returns the first element below [node] that
   satisfies [pred]. Search order: all element children of [node] are tested
   first; only when none matches is each child searched recursively, in
   order. *)
let rec find_descendant pred node =
  let children = element_children node in
  match List.find_opt pred children with
  | Some _ as found -> found
  | None -> List.find_map (find_descendant pred) children

(** [find_child_by_tag parent tag] is the first direct element child of
    [parent] whose lower-cased name equals [tag], if any. *)
let find_child_by_tag parent tag =
  List.find_opt
    (fun n -> Html5rw.is_element n && has_tag tag n)
    parent.Html5rw.Dom.children

(** [find_descendant_by_tag node tag] is the first descendant element of
    [node] whose lower-cased name equals [tag], if any. Direct children are
    preferred over deeper matches. *)
let find_descendant_by_tag node tag = find_descendant (has_tag tag) node

(** [find_by_class node cls] is the first descendant element of [node] whose
    class attribute contains the token [cls], if any. Direct children are
    preferred over deeper matches. *)
let find_by_class node cls = find_descendant (has_class cls) node

(** [extract_story article] collects the fields of a {!story} from the
    subtree rooted at [article]. Missing pieces fall back to the defaults
    documented on the record fields, so this never fails on an incomplete
    article. *)
let extract_story article =
  (* Title and URL come from the h2 > a link when there is one. *)
  let title, url =
    match find_descendant_by_tag article "h2" with
    | None -> ("(no title)", "#")
    | Some h2 -> (
        match find_child_by_tag h2 "a" with
        | Some a ->
            ( Html5rw.get_text_content a,
              Option.value ~default:"#" (Html5rw.get_attr a "href") )
        | None -> (Html5rw.get_text_content h2, "#"))
  in
  (* Text of the first descendant carrying class [cls], or [default]. *)
  let text_of_class cls ~default =
    match find_by_class article cls with
    | Some n -> Html5rw.get_text_content n
    | None -> default
  in
  let summary = text_of_class "summary" ~default:"" in
  let author = text_of_class "author" ~default:"Unknown" in
  let date =
    Option.bind (find_descendant_by_tag article "time") (fun t ->
        Html5rw.get_attr t "datetime")
    |> Option.value ~default:""
  in
  let featured = Html5rw.matches article ".featured" in
  { title; url; summary; author; date; featured }

(* Entry point: parse [sample_page] and print the page title, the links that
   sit inside a nav element, every article as a story, and the tag links. *)
let () =
  Printf.printf "=== Web Scraping Example ===\n\n";
  let result = Html5rw.parse (Bytes.Reader.of_string sample_page) in
  let href_of ~default a = Option.value ~default (Html5rw.get_attr a "href") in

  (* Extract page title: only the first match is printed. *)
  (match Html5rw.query result "title" with
  | t :: _ -> Printf.printf "Page title: %s\n\n" (Html5rw.get_text_content t)
  | [] -> ());

  (* Extract navigation links. The link list is queried once and reused for
     the tag section below, since the document does not change in between. *)
  Printf.printf "Navigation:\n";
  let links = Html5rw.query result "a" in
  links
  |> List.filter (fun a ->
         (* A link belongs to the navigation if any ancestor is a nav. *)
         List.exists (has_tag "nav") (Html5rw.ancestors a))
  |> List.iter (fun a ->
         Printf.printf " %s -> %s\n"
           (Html5rw.get_text_content a)
           (href_of ~default:"#" a));

  (* Extract stories *)
  Printf.printf "\nStories:\n";
  Html5rw.query result "article"
  |> List.iter (fun article ->
         let { title; url; summary; author; date; featured } =
           extract_story article
         in
         Printf.printf "\n %s%s\n"
           (if featured then "[FEATURED] " else "")
           title;
         Printf.printf " URL: %s\n" url;
         Printf.printf " Summary: %s\n" summary;
         Printf.printf " %s | %s\n" author date);

  (* Extract tags: links whose href starts with "/tag/" and names a tag,
     i.e. is strictly longer than the prefix. *)
  Printf.printf "\nPopular Tags:\n";
  let is_tag_link a =
    let href = href_of ~default:"" a in
    String.length href > 5 && String.equal (String.sub href 0 5) "/tag/"
  in
  links
  |> List.filter is_tag_link
  |> List.iter (fun a ->
         Printf.printf " #%s (%s)\n"
           (Html5rw.get_text_content a)
           (href_of ~default:"#" a))