(* OCaml HTML5 parser/serialiser based on Python's JustHTML.
   Listing captured from the repository viewer (branch: main, 1.8 kB). *)
open Bytesrw

(* Example: pulling plain text out of a parsed HTML document. *)

(* Sample page used as input by the driver below. *)
let html = {|
<!DOCTYPE html>
<html>
<head>
  <title>Article</title>
  <style>body { font-family: sans-serif; }</style>
  <script>console.log("Hello");</script>
</head>
<body>
  <article>
    <h1>The Great HTML5 Parser</h1>
    <p class="intro">
      This is the <em>introduction</em> to an article about
      <strong>HTML parsing</strong> in OCaml.
    </p>
    <p class="content">
      The parser follows the WHATWG specification and handles
      all kinds of malformed HTML gracefully.
    </p>
    <ul>
      <li>Feature 1: Fast parsing</li>
      <li>Feature 2: CSS selectors</li>
      <li>Feature 3: Encoding detection</li>
    </ul>
  </article>
  <footer>
    <p>Copyright 2024</p>
  </footer>
</body>
</html>
|}

(* Parses [html] once, then prints four views of its text to stdout. *)
let () =
  let doc = Html5rw.parse (Bytes.Reader.of_string html) in

  (* Whole-document text, using [to_text] without a separator argument. *)
  Printf.printf "=== All Text (default) ===\n";
  Printf.printf "%s\n\n" (Html5rw.to_text doc);

  (* Same extraction, this time passing "\n" as the separator. *)
  Printf.printf "=== Text with Newline Separator ===\n";
  Printf.printf "%s\n\n" (Html5rw.to_text ~separator:"\n" doc);

  (* Text content of every node matched by the "article" selector. *)
  Printf.printf "=== Article Text Only ===\n";
  Html5rw.query doc "article"
  |> List.iter (fun node ->
         Printf.printf "%s\n" (Html5rw.get_text_content node));

  (* Field-by-field report: one "Title" line per h1 match. *)
  Printf.printf "\n=== Structured Extraction ===\n";
  Html5rw.query doc "h1"
  |> List.iter (fun node ->
         Printf.printf "Title: %s\n" (Html5rw.get_text_content node));

  (* The "li" query runs before the "Features:" label is printed. *)
  let features = Html5rw.query doc "li" in
  Printf.printf "Features:\n";
  let print_feature node =
    Printf.printf "  - %s\n" (Html5rw.get_text_content node)
  in
  List.iter print_feature features