···11+# 0.4 - 2024-11-08
22+33+- Replace ocamlnet HTML parser with Lambda Soup (#15, @aantron)
44+55+# 0.3 - 2023-11-21
66+77+- Fall back to entry id if entry links doesn't exist (#11, @sabine)
88+99+# 0.2 - 2022-04-14
1010+1111+- Build with dune.
1212+- Make the types abstract and add accessor functions.
1313+- Support fetching meta description and SEO image from the posts links.
1414+1515+# 0.1.3 - 2015-07-28
1616+1717+- Make river compatible with the latest syndic API
1818+1919+# 0.1.2 - 2015-03-24
2020+2121+- Refactoring modules.
2222+2323+# 0.1.1 - 2015-03-19
2424+2525+- Upgrading version number.
2626+2727+# 0.1 - 2015-03-15
2828+2929+- Initial release
+13
stack/river/LICENSE
···11+Copyright (c) 2015, KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
22+33+Permission to use, copy, modify, and/or distribute this software for any
44+purpose with or without fee is hereby granted, provided that the above
55+copyright notice and this permission notice appear in all copies.
66+77+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
88+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
99+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1010+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1111+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1212+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1313+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+43
stack/river/Makefile
···11+.DEFAULT_GOAL := all
22+33+.PHONY: all
44+all:
55+ opam exec -- dune build --root . @install
66+77+.PHONY: deps
88+deps: ## Install development dependencies
99+ opam install -y dune-release ocamlformat utop ocaml-lsp-server
1010+ opam install --deps-only --with-test --with-doc -y .
1111+1212+.PHONY: create_switch
1313+create_switch: ## Create an opam switch without any dependency
1414+ opam switch create . --no-install -y
1515+1616+.PHONY: switch
1717+switch: ## Create an opam switch and install development dependencies
1818+ opam install . --deps-only --with-doc --with-test
1919+ opam install -y dune-release ocamlformat utop ocaml-lsp-server
2020+2121+.PHONY: build
2222+build: ## Build the project, including non installable libraries and executables
2323+ opam exec -- dune build --root .
2424+2525+.PHONY: test
2626+test: ## Run the unit tests
2727+ opam exec -- dune runtest --root .
2828+2929+.PHONY: clean
3030+clean: ## Clean build artifacts and other generated files
3131+ opam exec -- dune clean --root .
3232+3333+.PHONY: doc
3434+doc: ## Generate odoc documentation
3535+ opam exec -- dune build --root . @doc
3636+3737+.PHONY: fmt
3838+fmt: ## Format the codebase with ocamlformat
3939+ opam exec -- dune build --root . --auto-promote @fmt
4040+4141+.PHONY: watch
4242+watch: ## Watch for the filesystem and rebuild on every change
4343+ opam exec -- dune build --root . --watch
+58
stack/river/README.md
···11+# River
22+33+[](https://github.com/kayceesrk/river/actions)
44+55+RSS2 and Atom feed aggregator for OCaml
66+77+88+## Features
99+1010+- Performs deduplication.
1111+- Supports pagination and generating well-formed html prefix snippets.
1212+- Support for generating aggregate feeds.
1313+- Sorts the posts from most recent to oldest.
1414+- Depends on Lambda Soup for html parsing.
1515+1616+## Installation
1717+1818+```bash
1919+opam install river
2020+```
2121+2222+## Usage
2323+2424+Here's an example program that aggregates the feeds from different sources:
2525+2626+```ocaml
2727+let sources =
2828+ River.
2929+ [
3030+ { name = "KC Sivaramakrishnan"; url = "http://kcsrk.info/atom-ocaml.xml" };
3131+ {
3232+ name = "Amir Chaudhry";
3333+ url = "http://amirchaudhry.com/tags/ocamllabs-atom.xml";
3434+ };
3535+ ]
3636+3737+let () =
3838+ let feeds = List.map River.fetch sources in
3939+ let posts = River.posts feeds in
4040+ let entries = River.create_atom_entries posts in
4141+ let feed =
4242+ let authors = [ Syndic.Atom.author "OCaml Blog" ] in
4343+ let id = Uri.of_string "https://ocaml.org/atom.xml" in
4444+ let links = [ Syndic.Atom.link ~rel:Self id ] in
4545+ let title : Syndic.Atom.text_construct =
4646+ Text "OCaml Blog: Read the latest OCaml news from the community."
4747+ in
4848+ let updated = Ptime.of_float_s (Unix.gettimeofday ()) |> Option.get in
4949+ Syndic.Atom.feed ~authors ~links ~id ~title ~updated entries
5050+ in
5151+ let out_channel = open_out "example/atom.xml" in
5252+ Syndic.Atom.output feed (`Channel out_channel);
5353+ close_out out_channel
5454+```
5555+5656+## Contributing
5757+5858+Take a look at our [Contributing Guide](CONTRIBUTING.md).
···11+let sources =
22+ River.
33+ [
44+ { name = "KC Sivaramakrishnan"; url = "http://kcsrk.info/atom-ocaml.xml" };
55+ {
66+ name = "Amir Chaudhry";
77+ url = "http://amirchaudhry.com/tags/ocamllabs-atom.xml";
88+ };
99+ ]
1010+1111+let main () =
1212+ let feeds = List.map River.fetch sources in
1313+ let posts = River.posts feeds in
1414+ let entries = River.create_atom_entries posts in
1515+ let feed =
1616+ let authors = [ Syndic.Atom.author "OCaml Blog" ] in
1717+ let id = Uri.of_string "https://ocaml.org/atom.xml" in
1818+ let links = [ Syndic.Atom.link ~rel:Self id ] in
1919+ let title : Syndic.Atom.text_construct =
2020+ Text "OCaml Blog: Read the latest OCaml news from the community."
2121+ in
2222+ let updated = Ptime.of_float_s (Unix.gettimeofday ()) |> Option.get in
2323+ Syndic.Atom.feed ~authors ~links ~id ~title ~updated entries
2424+ in
2525+ let out_channel = open_out "example/atom.xml" in
2626+ Syndic.Atom.output feed (`Channel out_channel);
2727+ close_out out_channel
2828+2929+let () =
3030+ Printexc.record_backtrace true;
3131+ main ()
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+type source = { name : string; url : string }
1919+type content = Atom of Syndic.Atom.feed | Rss2 of Syndic.Rss2.channel
2020+2121+let string_of_feed = function Atom _ -> "Atom" | Rss2 _ -> "Rss2"
2222+2323+type t = { name : string; title : string; url : string; content : content }
2424+2525+let classify_feed ~xmlbase (xml : string) =
2626+ try Atom (Syndic.Atom.parse ~xmlbase (Xmlm.make_input (`String (0, xml))))
2727+ with Syndic.Atom.Error.Error _ -> (
2828+ try Rss2 (Syndic.Rss2.parse ~xmlbase (Xmlm.make_input (`String (0, xml))))
2929+ with Syndic.Rss2.Error.Error _ -> failwith "Neither Atom nor RSS2 feed")
3030+3131+let fetch (source : source) =
3232+ let xmlbase = Uri.of_string @@ source.url in
3333+ let response = Http.get source.url in
3434+ let content = classify_feed ~xmlbase response in
3535+ let title =
3636+ match content with
3737+ | Atom atom -> Util.string_of_text_construct atom.Syndic.Atom.title
3838+ | Rss2 ch -> ch.Syndic.Rss2.title
3939+ in
4040+ { name = source.name; title; content; url = source.url }
+73
stack/river/lib/http.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(* Download urls and cache them — especially during development, it slows down
1919+ the rendering to download over and over again the same URL. *)
2020+2121+open Printf
2222+open Lwt
2323+open Cohttp
2424+open Cohttp.Response
2525+open Cohttp.Code
2626+2727+exception Status_unhandled of string
2828+exception Timeout
2929+3030+let max_num_redirects = 5
3131+3232+let get_location_exn headers =
3333+ match Header.get headers "location" with
3434+ | Some x -> x
3535+ | None -> raise @@ Status_unhandled "Location HTTP header not found"
3636+3737+let rec get_uri uri = function
3838+ | 0 -> raise (Status_unhandled "Too many redirects")
3939+ | n ->
4040+ let main =
4141+ Cohttp_lwt_unix.Client.get uri >>= fun (resp, body) ->
4242+ match resp.status with
4343+ | `OK -> Cohttp_lwt.Body.to_string body
4444+ | `Found | `See_other | `Moved_permanently | `Temporary_redirect
4545+ | `Permanent_redirect -> (
4646+ let l = Uri.of_string @@ get_location_exn resp.headers in
4747+ match Uri.host l with
4848+ | Some _ -> get_uri l (n - 1)
4949+ | None ->
5050+ let host = Uri.host uri in
5151+ let scheme = Uri.scheme uri in
5252+ let new_uri = Uri.with_scheme (Uri.with_host l host) scheme in
5353+ get_uri new_uri (n - 1))
5454+ | _ -> raise @@ Status_unhandled (string_of_status resp.status)
5555+ in
5656+ let timeout =
5757+ Lwt_unix.sleep (float_of_int 3) >>= fun () -> Lwt.fail Timeout
5858+ in
5959+ Lwt.pick [ main; timeout ]
6060+6161+let get url =
6262+ eprintf "Downloading %s ... %!" url;
6363+ try
6464+ let data = Lwt_main.run @@ get_uri (Uri.of_string url) max_num_redirects in
6565+ eprintf "done %!\n";
6666+ data
6767+ with
6868+ | (Status_unhandled s | Failure s) as e ->
6969+ eprintf "Failed: %s\n" s;
7070+ raise e
7171+ | Timeout as e ->
7272+ eprintf "Failed: Timeout\n";
7373+ raise e
+28
stack/river/lib/http.mli
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+exception Status_unhandled of string
1919+exception Timeout
2020+2121+val get : string -> string
2222+(** [get uri] returns the body of the response of the HTTP GET request on [uri].
2323+2424+ If the answer of is a redirection, it will follow the redirections up to 5
2525+ redirects.
2626+2727+ The answer is cached for [cache_secs] seconds, where [cache_secs] is 3600
2828+ seconds (1 hour) by default. *)
+80
stack/river/lib/meta.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** This module determines an image to be used as preview of a website.
1919+2020+ It does this by following the same logic Google+ and other websites use, and
2121+ described in this article:
2222+ https://www.raymondcamden.com/2011/07/26/How-are-Facebook-and-Google-creating-link-previews *)
2323+2424+let og_image html =
2525+ let open Soup in
2626+ let soup = parse html in
2727+ try soup $ "meta[property=og:image]" |> R.attribute "content" |> Option.some
2828+ with Failure _ -> None
2929+3030+let image_src html =
3131+ let open Soup in
3232+ let soup = parse html in
3333+ try soup $ "link[rel=\"image_src\"]" |> R.attribute "href" |> Option.some
3434+ with Failure _ -> None
3535+3636+let twitter_image html =
3737+ let open Soup in
3838+ let soup = parse html in
3939+ try
4040+ soup $ "meta[name=\"twitter:image\"]" |> R.attribute "content"
4141+ |> Option.some
4242+ with Failure _ -> None
4343+4444+let og_description html =
4545+ let open Soup in
4646+ let soup = parse html in
4747+ try
4848+ soup $ "meta[property=og:description]" |> R.attribute "content"
4949+ |> Option.some
5050+ with Failure _ -> None
5151+5252+let description html =
5353+ let open Soup in
5454+ let soup = parse html in
5555+ try
5656+ soup $ "meta[property=description]" |> R.attribute "content" |> Option.some
5757+ with Failure _ -> None
5858+5959+let preview_image html =
6060+ let preview_image =
6161+ match og_image html with
6262+ | None -> (
6363+ match image_src html with
6464+ | None -> twitter_image html
6565+ | Some x -> Some x)
6666+ | Some x -> Some x
6767+ in
6868+ match Option.map String.trim preview_image with
6969+ | Some "" -> None
7070+ | Some x -> Some x
7171+ | None -> None
7272+7373+let description html =
7474+ let preview_image =
7575+ match og_description html with None -> description html | Some x -> Some x
7676+ in
7777+ match Option.map String.trim preview_image with
7878+ | Some "" -> None
7979+ | Some x -> Some x
8080+ | None -> None
+215
stack/river/lib/post.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+type t = {
1919+ title : string;
2020+ link : Uri.t option;
2121+ date : Syndic.Date.t option;
2222+ feed : Feed.t;
2323+ author : string;
2424+ email : string;
2525+ content : Soup.soup Soup.node;
2626+ mutable link_response : (string, string) result option;
2727+}
2828+2929+let resolve_links_attr ~xmlbase attr el =
3030+ Soup.R.attribute attr el
3131+ |> Uri.of_string
3232+ |> Syndic.XML.resolve ~xmlbase
3333+ |> Uri.to_string
3434+ |> fun value -> Soup.set_attribute attr value el
3535+3636+(* Things that posts should not contain *)
3737+let undesired_tags = [ "style"; "script" ]
3838+let undesired_attr = [ "id" ]
3939+4040+let html_of_text ?xmlbase s =
4141+ let soup = Soup.parse s in
4242+ let ($$) = Soup.($$) in
4343+ soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
4444+ soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
4545+ undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
4646+ soup $$ "*" |> Soup.iter (fun el ->
4747+ undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el));
4848+ soup
4949+5050+(* Do not trust sites using XML for HTML content. Convert to string and parse
5151+ back. (Does not always fix bad HTML unfortunately.) *)
5252+let html_of_syndic =
5353+ let ns_prefix _ = Some "" in
5454+ fun ?xmlbase h ->
5555+ html_of_text ?xmlbase
5656+ (String.concat "" (List.map (Syndic.XML.to_string ~ns_prefix) h))
5757+5858+let string_of_option = function None -> "" | Some s -> s
5959+6060+(* Email on the forge contain the name in parenthesis *)
6161+let forge_name_re = Str.regexp ".*(\\([^()]*\\))"
6262+6363+let post_compare p1 p2 =
6464+ (* Most recent posts first. Posts with no date are always last *)
6565+ match (p1.date, p2.date) with
6666+ | Some d1, Some d2 -> Syndic.Date.compare d2 d1
6767+ | None, Some _ -> 1
6868+ | Some _, None -> -1
6969+ | None, None -> 1
7070+7171+let rec remove n l =
7272+ if n <= 0 then l else match l with [] -> [] | _ :: tl -> remove (n - 1) tl
7373+7474+let rec take n = function
7575+ | [] -> []
7676+ | e :: tl -> if n > 0 then e :: take (n - 1) tl else []
7777+7878+(* Blog feed
7979+ ***********************************************************************)
8080+8181+let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
8282+ let link =
8383+ try
8484+ Some
8585+ (List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links)
8686+ .href
8787+ with Not_found -> (
8888+ match e.links with
8989+ | l :: _ -> Some l.href
9090+ | [] -> (
9191+ match Uri.scheme e.id with
9292+ | Some "http" -> Some e.id
9393+ | Some "https" -> Some e.id
9494+ | _ -> None))
9595+ in
9696+ let date =
9797+ match e.published with Some _ -> e.published | None -> Some e.updated
9898+ in
9999+ let content =
100100+ match e.content with
101101+ | Some (Text s) -> html_of_text s
102102+ | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
103103+ | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
104104+ | Some (Mime _) | Some (Src _) | None -> (
105105+ match e.summary with
106106+ | Some (Text s) -> html_of_text s
107107+ | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
108108+ | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
109109+ | None -> Soup.parse "")
110110+ in
111111+ let author, _ = e.authors in
112112+ {
113113+ title = Util.string_of_text_construct e.title;
114114+ link;
115115+ date;
116116+ feed;
117117+ author = author.name;
118118+ email = "";
119119+ content;
120120+ link_response = None;
121121+ }
122122+123123+let post_of_rss2 ~(feed : Feed.t) it =
124124+ let title, content =
125125+ match it.Syndic.Rss2.story with
126126+ | All (t, xmlbase, d) -> (
127127+ ( t,
128128+ match it.content with
129129+ | _, "" -> html_of_text ?xmlbase d
130130+ | xmlbase, c -> html_of_text ?xmlbase c ))
131131+ | Title t ->
132132+ let xmlbase, c = it.content in
133133+ (t, html_of_text ?xmlbase c)
134134+ | Description (xmlbase, d) -> (
135135+ ( "",
136136+ match it.content with
137137+ | _, "" -> html_of_text ?xmlbase d
138138+ | xmlbase, c -> html_of_text ?xmlbase c ))
139139+ in
140140+ let link =
141141+ match (it.guid, it.link) with
142142+ | Some u, _ when u.permalink -> Some u.data
143143+ | _, Some _ -> it.link
144144+ | Some u, _ ->
145145+ (* Sometimes the guid is indicated with isPermaLink="false" but is
146146+ nonetheless the only URL we get (e.g. ocamlpro). *)
147147+ Some u.data
148148+ | None, None -> None
149149+ in
150150+ {
151151+ title;
152152+ link;
153153+ feed;
154154+ author = feed.name;
155155+ email = string_of_option it.author;
156156+ content;
157157+ date = it.pubDate;
158158+ link_response = None;
159159+ }
160160+161161+let posts_of_feed c =
162162+ match c.Feed.content with
163163+ | Feed.Atom f -> List.map (post_of_atom ~feed:c) f.Syndic.Atom.entries
164164+ | Feed.Rss2 ch -> List.map (post_of_rss2 ~feed:c) ch.Syndic.Rss2.items
165165+166166+let mk_entry post =
167167+ let content = Syndic.Atom.Html (None, Soup.to_string post.content) in
168168+ let contributors =
169169+ [ Syndic.Atom.author ~uri:(Uri.of_string post.feed.url) post.feed.name ]
170170+ in
171171+ let links =
172172+ match post.link with
173173+ | Some l -> [ Syndic.Atom.link ~rel:Syndic.Atom.Alternate l ]
174174+ | None -> []
175175+ in
176176+ (* TODO: include source *)
177177+ let id =
178178+ match post.link with
179179+ | Some l -> l
180180+ | None -> Uri.of_string (Digest.to_hex (Digest.string post.title))
181181+ in
182182+ let authors = (Syndic.Atom.author ~email:post.email post.author, []) in
183183+ let title : Syndic.Atom.text_construct = Syndic.Atom.Text post.title in
184184+ let updated =
185185+ match post.date with
186186+ (* Atom entry requires a date but RSS2 does not. So if a date
187187+ * is not available, just capture the current date. *)
188188+ | None -> Ptime.of_float_s (Unix.gettimeofday ()) |> Option.get
189189+ | Some d -> d
190190+ in
191191+ Syndic.Atom.entry ~content ~contributors ~links ~id ~authors ~title ~updated
192192+ ()
193193+194194+let mk_entries posts = List.map mk_entry posts
195195+196196+let get_posts ?n ?(ofs = 0) planet_feeds =
197197+ let posts = List.concat @@ List.map posts_of_feed planet_feeds in
198198+ let posts = List.sort post_compare posts in
199199+ let posts = remove ofs posts in
200200+ match n with None -> posts | Some n -> take n posts
201201+202202+(* Fetch the link response and cache it. *)
203203+let fetch_link t =
204204+ match (t.link, t.link_response) with
205205+ | None, _ -> None
206206+ | Some _, Some (Ok x) -> Some x
207207+ | Some _, Some (Error _) -> None
208208+ | Some link, None -> (
209209+ try
210210+ let response = Http.get (Uri.to_string link) in
211211+ t.link_response <- Some (Ok response);
212212+ Some response
213213+ with _exn ->
214214+ t.link_response <- Some (Error "");
215215+ None)
+44
stack/river/lib/river.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+type source = Feed.source = { name : string; url : string }
1919+type feed = Feed.t
2020+type post = Post.t
2121+2222+let fetch = Feed.fetch
2323+let name feed = feed.Feed.name
2424+let url feed = feed.Feed.url
2525+let posts feeds = Post.get_posts feeds
2626+let title post = post.Post.title
2727+let link post = post.Post.link
2828+let date post = post.Post.date
2929+let feed post = post.Post.feed
3030+let author post = post.Post.author
3131+let email post = post.Post.email
3232+let content post = Soup.to_string post.Post.content
3333+3434+let meta_description post =
3535+ match Post.fetch_link post with
3636+ | None -> None
3737+ | Some response -> Meta.description response
3838+3939+let seo_image post =
4040+ match Post.fetch_link post with
4141+ | None -> None
4242+ | Some response -> Meta.preview_image response
4343+4444+let create_atom_entries = Post.mk_entries
+73
stack/river/lib/river.mli
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+type source = { name : string; url : string }
1919+(** The source of a feed. *)
2020+2121+type feed
2222+type post
2323+2424+val fetch : source -> feed
2525+(** [fetch source] returns an Atom or RSS feed from a source. *)
2626+2727+val name : feed -> string
2828+(** [name feed] is the name of the feed source passed to [fetch]. *)
2929+3030+val url : feed -> string
3131+(** [url feed] is the url of the feed source passed to [fetch]. *)
3232+3333+val posts : feed list -> post list
3434+(** [posts feeds] is the list of deduplicated posts of the given feeds. *)
3535+3636+val feed : post -> feed
3737+(** [feed post] is the feed the post originates from. *)
3838+3939+val title : post -> string
4040+(** [title post] is the title of the post. *)
4141+4242+val link : post -> Uri.t option
4343+(** [link post] is the link of the post. *)
4444+4545+val date : post -> Syndic.Date.t option
4646+(** [date post] is the date of the post. *)
4747+4848+val author : post -> string
4949+(** [author post] is the author of the post. *)
5050+5151+val email : post -> string
5252+(** [email post] is the email of the post. *)
5353+5454+val content : post -> string
5555+(** [content post] is the content of the post. *)
5656+5757+val meta_description : post -> string option
5858+(** [meta_description post] is the meta description of the post on the origin
5959+ site.
6060+6161+ To get the meta description, we make get the content of [link post] and look
6262+ for an HTML meta tag with the name "description" or "og:description".*)
6363+6464+val seo_image : post -> string option
6565+(** [seo_image post] is the image to be used by social networks and links to the
6666+ post.
6767+6868+ To get the seo image, we make get the content of [link post] and look for an
6969+ HTML meta tag with the name "og:image" or "twitter:image". *)
7070+7171+val create_atom_entries : post list -> Syndic.Atom.entry list
7272+(** [create_atom_feed posts] creates a list of atom entries, which can then be
7373+ used to create an atom feed that is an aggregate of the posts. *)
+33
stack/river/lib/util.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+open Syndic
1919+2020+(* Remove all tags *)
2121+let rec syndic_to_buffer b = function
2222+ | XML.Node (_, _, subs) -> List.iter (syndic_to_buffer b) subs
2323+ | XML.Data (_, d) -> Buffer.add_string b d
2424+2525+let syndic_to_string x =
2626+ let b = Buffer.create 1024 in
2727+ List.iter (syndic_to_buffer b) x;
2828+ Buffer.contents b
2929+3030+let string_of_text_construct : Atom.text_construct -> string = function
3131+ (* FIXME: we probably would like to parse the HTML and remove the tags *)
3232+ | Atom.Text s | Atom.Html (_, s) -> s
3333+ | Atom.Xhtml (_, x) -> syndic_to_string x