OCaml HTML5 parser/serialiser based on Python's JustHTML
open Bytesrw

(* CSS selector query example.

   Parses a small product-list document with [Html5rw.parse], then prints the
   outcome of a series of [Html5rw.query] / [Html5rw.matches] calls, one
   section per selector form: id, class, tag, attribute presence, attribute
   value, multiple classes, pseudo-classes, the universal selector, and a
   final section that combines selector matching with ordinary OCaml
   filtering. *)

(* Sample document queried by every section below. *)
let html = {|
<!DOCTYPE html>
<html>
<head><title>Products</title></head>
<body>
  <div class="container">
    <h1 id="title">Product List</h1>
    <ul class="products">
      <li class="product" data-id="1">
        <span class="name">Widget A</span>
        <span class="price">$10.00</span>
      </li>
      <li class="product" data-id="2">
        <span class="name">Widget B</span>
        <span class="price">$15.00</span>
      </li>
      <li class="product featured" data-id="3">
        <span class="name">Widget C</span>
        <span class="price">$20.00</span>
      </li>
    </ul>
  </div>
</body>
</html>
|}

(* [parse_price text] reads a price such as ["$10.00"] as a float.

   Surrounding whitespace and one optional leading ['$'] are ignored.
   Returns [None] when what remains is not a valid float literal, so an empty
   or malformed price is skipped by the caller instead of raising
   [Invalid_argument] (from [String.sub]) or [Failure] (from
   [float_of_string]). *)
let parse_price text =
  let text = String.trim text in
  let len = String.length text in
  let amount =
    if len > 0 && Char.equal text.[0] '$' then String.sub text 1 (len - 1)
    else text
  in
  float_of_string_opt amount

let () =
  let result = Html5rw.parse (Bytes.Reader.of_string html) in

  (* Find element by ID *)
  Printf.printf "=== ID Selector (#title) ===\n";
  let titles = Html5rw.query result "#title" in
  List.iter (fun node ->
    Printf.printf "Found: %s\n" (Html5rw.get_text_content node)
  ) titles;

  (* Find elements by class *)
  Printf.printf "\n=== Class Selector (.product) ===\n";
  let products = Html5rw.query result ".product" in
  Printf.printf "Found %d products\n" (List.length products);

  (* Find elements by tag *)
  Printf.printf "\n=== Tag Selector (span) ===\n";
  let spans = Html5rw.query result "span" in
  Printf.printf "Found %d span elements\n" (List.length spans);

  (* Find with attribute presence *)
  Printf.printf "\n=== Attribute Presence ([data-id]) ===\n";
  let with_data_id = Html5rw.query result "[data-id]" in
  List.iter (fun node ->
    match Html5rw.get_attr node "data-id" with
    | Some id -> Printf.printf "Found element with data-id=%s\n" id
    | None -> ()
  ) with_data_id;

  (* Find with attribute value *)
  Printf.printf "\n=== Attribute Value ([data-id=\"3\"]) ===\n";
  let featured = Html5rw.query result "[data-id=\"3\"]" in
  List.iter (fun node ->
    Printf.printf "Found: %s\n" (Html5rw.get_text_content node)
  ) featured;

  (* Find with multiple classes: the compound selector requires both classes
     on the same element, which is what the section header advertises. *)
  Printf.printf "\n=== Multiple Classes (.product.featured) ===\n";
  let featured_products = Html5rw.query result ".product.featured" in
  List.iter (fun node ->
    Printf.printf "Featured: %s\n" (Html5rw.get_text_content node)
  ) featured_products;

  (* Check if a node matches a selector *)
  Printf.printf "\n=== Match Check (.featured) ===\n";
  List.iter (fun node ->
    if Html5rw.matches node ".featured" then
      Printf.printf "This product is featured!\n"
  ) products;

  (* Pseudo-class: first-child *)
  Printf.printf "\n=== Pseudo-class (:first-child) ===\n";
  let first = Html5rw.query result "li:first-child" in
  List.iter (fun node ->
    Printf.printf "First li: %s\n" (String.trim (Html5rw.get_text_content node))
  ) first;

  (* Pseudo-class: last-child *)
  Printf.printf "\n=== Pseudo-class (:last-child) ===\n";
  let last = Html5rw.query result "li:last-child" in
  List.iter (fun node ->
    Printf.printf "Last li: %s\n" (String.trim (Html5rw.get_text_content node))
  ) last;

  (* Universal selector *)
  Printf.printf "\n=== Universal Selector (*) ===\n";
  let all = Html5rw.query result "*" in
  Printf.printf "Total elements: %d\n" (List.length all);

  (* Combining queries: find products then filter *)
  Printf.printf "\n=== Combined: Products with price > $15 ===\n";
  List.iter (fun product ->
    (* Collect the product's descendants once; both the price lookup and the
       name lookup below reuse this list. *)
    let descendants = Html5rw.descendants product in
    descendants
    |> List.filter (fun node -> Html5rw.matches node ".price")
    |> List.iter (fun price_span ->
      let price_text = Html5rw.get_text_content price_span in
      (* A price that cannot be parsed is treated as not above the
         threshold rather than aborting the whole listing. *)
      let above_threshold =
        match parse_price price_text with
        | Some price -> price > 15.0
        | None -> false
      in
      if above_threshold then
        (* Report the first name span of the product, if it has one. *)
        descendants
        |> List.find_opt (fun node -> Html5rw.matches node ".name")
        |> Option.iter (fun name ->
          Printf.printf " %s: %s\n" (Html5rw.get_text_content name) price_text))
  ) products