(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
1open Bytesrw
2
3(* Text extraction example *)
4
(* Sample HTML document used as the input of this example.  The head holds a
   title plus a style and a script element; the body holds an article (one
   h1 heading, an intro paragraph with inline em/strong markup, a content
   paragraph and a three-item list) followed by a footer paragraph.  It is
   written as a quoted string so the markup needs no escaping. *)
let html = {|
<!DOCTYPE html>
<html>
<head>
 <title>Article</title>
 <style>body { font-family: sans-serif; }</style>
 <script>console.log("Hello");</script>
</head>
<body>
 <article>
 <h1>The Great HTML5 Parser</h1>
 <p class="intro">
 This is the <em>introduction</em> to an article about
 <strong>HTML parsing</strong> in OCaml.
 </p>
 <p class="content">
 The parser follows the WHATWG specification and handles
 all kinds of malformed HTML gracefully.
 </p>
 <ul>
 <li>Feature 1: Fast parsing</li>
 <li>Feature 2: CSS selectors</li>
 <li>Feature 3: Encoding detection</li>
 </ul>
 </article>
 <footer>
 <p>Copyright 2024</p>
 </footer>
</body>
</html>
|}
36
(* Entry point of the example.  Parses the sample [html] document once and
   prints four sections to stdout:
   - the text of the whole document, calling [Html5rw.to_text] without
     optional arguments;
   - the same text with ["\n"] passed as [~separator];
   - the text content of every node matched by the "article" selector;
   - a small structured report: each "h1" as a title, each "li" as a
     feature bullet. *)
let () =
  let doc = Html5rw.parse (Bytes.Reader.of_string html) in
  (* Print the text content of each node in [nodes], formatted with [fmt]
     (a format expecting exactly one string). *)
  let print_text_of fmt nodes =
    List.iter
      (fun node -> Printf.printf fmt (Html5rw.get_text_content node))
      nodes
  in

  (* Whole-document text, no optional arguments. *)
  print_string "=== All Text (default) ===\n";
  Printf.printf "%s\n\n" (Html5rw.to_text doc);

  (* Whole-document text again, this time with an explicit separator. *)
  print_string "=== Text with Newline Separator ===\n";
  Printf.printf "%s\n\n" (Html5rw.to_text ~separator:"\n" doc);

  (* Text restricted to the nodes matched by the "article" selector. *)
  print_string "=== Article Text Only ===\n";
  print_text_of "%s\n" (Html5rw.query doc "article");

  (* Structured report: headings first, then the list items. *)
  print_string "\n=== Structured Extraction ===\n";
  print_text_of "Title: %s\n" (Html5rw.query doc "h1");

  (* The "li" query runs before its header is printed, as in the original
     statement order. *)
  let features = Html5rw.query doc "li" in
  print_string "Features:\n";
  print_text_of " - %s\n" features