examples/text_extraction.ml at main · anil.recoil.org/ocaml-html5rw

OCaml HTML5 parser/serialiser based on Python's JustHTML
ocaml-html5rw / examples / text_extraction.ml
at main 1.8 kB view raw
 1open Bytesrw
 2
 3(* Text extraction example *)
 4
 5let html = {|
 6<!DOCTYPE html>
 7<html>
 8<head>
 9  <title>Article</title>
10  <style>body { font-family: sans-serif; }</style>
11  <script>console.log("Hello");</script>
12</head>
13<body>
14  <article>
15    <h1>The Great HTML5 Parser</h1>
16    <p class="intro">
17      This is the <em>introduction</em> to an article about
18      <strong>HTML parsing</strong> in OCaml.
19    </p>
20    <p class="content">
21      The parser follows the WHATWG specification and handles
22      all kinds of malformed HTML gracefully.
23    </p>
24    <ul>
25      <li>Feature 1: Fast parsing</li>
26      <li>Feature 2: CSS selectors</li>
27      <li>Feature 3: Encoding detection</li>
28    </ul>
29  </article>
30  <footer>
31    <p>Copyright 2024</p>
32  </footer>
33</body>
34</html>
35|}
36
(* Entry point of the example: parses [html] once, then prints its text in
   several ways: the whole document (default and newline separator), the
   text of each article element, and finally the h1 and li contents. *)
let () =
  let reader = Bytes.Reader.of_string html in
  let doc = Html5rw.parse reader in

  (* Sends the text content of every node in [nodes] to [emit], in list
     order. *)
  let print_nodes emit nodes =
    List.iter (fun node -> emit (Html5rw.get_text_content node)) nodes
  in

  (* Whole document, no explicit separator argument. *)
  print_string "=== All Text (default) ===\n";
  Printf.printf "%s\n\n" (Html5rw.to_text doc);

  (* Whole document again, passing a newline as the separator. *)
  print_string "=== Text with Newline Separator ===\n";
  Printf.printf "%s\n\n" (Html5rw.to_text ~separator:"\n" doc);

  (* Text of each node matched by the article selector, one per line. *)
  print_string "=== Article Text Only ===\n";
  Html5rw.query doc "article" |> print_nodes (Printf.printf "%s\n");

  (* Labelled output: every h1 as a title, every li as a feature bullet. *)
  print_string "\n=== Structured Extraction ===\n";
  Html5rw.query doc "h1" |> print_nodes (Printf.printf "Title: %s\n");

  (* The li query runs before the Features header is printed. *)
  let features = Html5rw.query doc "li" in
  print_string "Features:\n";
  print_nodes (Printf.printf "  - %s\n") features