My personal site
cherry.computer
htmx
tailwind
axum
askama
1use std::{
2 sync::{Arc, LazyLock},
3 time::Duration,
4};
5
6use anyhow::Context;
7use reqwest::{Client, Url};
8use scraper::{ElementRef, Html, Selector};
9use serde::Deserialize;
10use tokio::sync::RwLock;
11use tracing::instrument;
12
13use super::{
14 Media,
15 cached::{MediaCache, cache_or_fetch, try_cache_or_fetch},
16};
17
18#[derive(Deserialize, Debug, Clone)]
19pub struct ImageUrlMetadata {
20 url: String,
21}
22
23struct Extracted {
24 name: String,
25 image_url: Url,
26 rating: Option<u8>,
27 url: String,
28}
29
/// User agent sent with every request made by this module.
///
/// Cloudflare's bot detection appears to be more lenient towards user agents that do not
/// name a well-known HTTP client such as reqwest or curl, hence the custom value.
const USER_AGENT: &str = "myivo/1.0.0";
33
34pub async fn fetch() -> anyhow::Result<Media> {
35 let client = Client::builder()
36 .user_agent(USER_AGENT)
37 .build()
38 .context("failed to build client")?;
39 let page_url = Url::parse("https://letterboxd.com/ivom/films/diary/")
40 .context("wrote invalid Letterboxd URL")?;
41 let html = client
42 .get(page_url.clone())
43 // including this header seems to contribute to getting past CloudFlare's bot detection.
44 .header("priority", "u=0, i")
45 .send()
46 .await
47 .context("failed to fetch Letterboxd page")?
48 .text()
49 .await
50 .context("failed to get HTML text")?;
51 let Extracted {
52 name,
53 image_url,
54 rating,
55 url,
56 } = parse_html(&html)?;
57
58 let image_url_data: ImageUrlMetadata = client
59 .get(image_url.clone())
60 .send()
61 .await
62 .with_context(|| format!("failed to fetch image metadata from URL {image_url}"))?
63 .json()
64 .await
65 .context("failed to parse image metadata")?;
66 let formatted_rating = match rating {
67 Some(rating) => format!(
68 "{} {}",
69 f32::from(rating) / 2.0,
70 if rating == 2 { "star" } else { "stars" }
71 ),
72 None => "no rating".to_owned(),
73 };
74 let url = page_url.join(&url).context("film URL was invalid")?;
75
76 Ok(Media {
77 name,
78 image: image_url_data.url,
79 context: formatted_rating,
80 url: url.into(),
81 })
82}
83
84fn parse_html(html: &str) -> anyhow::Result<Extracted> {
85 static FIRST_ENTRY_SEL: LazyLock<Selector> =
86 LazyLock::new(|| Selector::parse(".diary-entry-row:first-child").unwrap());
87 static NAME_SEL: LazyLock<Selector> = LazyLock::new(|| Selector::parse(".name").unwrap());
88 static POSTER_COMPONENT_SEL: LazyLock<Selector> =
89 LazyLock::new(|| Selector::parse(".react-component:has(> .poster)").unwrap());
90 static RATING_SEL: LazyLock<Selector> = LazyLock::new(|| Selector::parse(".rating").unwrap());
91 static URL_SEL: LazyLock<Selector> =
92 LazyLock::new(|| Selector::parse(".inline-production-masthead .name a").unwrap());
93
94 let document = Html::parse_document(html);
95
96 let first_entry = document
97 .select(&FIRST_ENTRY_SEL)
98 .next()
99 .context("couldn't find any journal entries")?;
100 let name = first_entry
101 .select(&NAME_SEL)
102 .next()
103 .context("couldn't find name element")?
104 .text()
105 .next()
106 .context("name element didn't have any text")?
107 .to_owned();
108 let poster_component = first_entry
109 .select(&POSTER_COMPONENT_SEL)
110 .next()
111 .context("couldn't find post component")?;
112 let rating = first_entry
113 .select(&RATING_SEL)
114 .next()
115 .context("couldn't find rating component")?
116 .value()
117 .classes()
118 .find_map(|class| class.strip_prefix("rated-"))
119 .and_then(|rating| rating.parse().ok());
120 let url = first_entry
121 .select(&URL_SEL)
122 .next()
123 .context("couldn't find film URL element")?
124 .attr("href")
125 .context("film URL element didn't have a URL")?
126 .to_owned();
127
128 let image_url = build_image_url(poster_component)?;
129
130 Ok(Extracted {
131 name,
132 image_url,
133 rating,
134 url,
135 })
136}
137
138fn build_image_url(poster_component: ElementRef) -> anyhow::Result<Url> {
139 let film_path = poster_component
140 .attr("data-item-link")
141 .context("poster component didn't have an image URL path")?;
142 let cache_key = poster_component.attr("data-cache-busting-key");
143 let image_size = 230;
144 let image_url = format!("https://letterboxd.com{film_path}/poster/std/{image_size}/",);
145 let mut image_url =
146 Url::parse(&image_url).with_context(|| format!("failed to parse URL {image_url}"))?;
147 if let Some(cache_key) = cache_key {
148 image_url.query_pairs_mut().append_pair("k", cache_key);
149 }
150
151 Ok(image_url)
152}
153
154static CACHE: LazyLock<MediaCache> = LazyLock::new(|| Arc::new(RwLock::new(None)));
155static TTL: Duration = Duration::from_secs(1800);
156#[instrument(name = "letterboxd_try_cached_fetch")]
157pub fn try_cached_fetch() -> Option<Media> {
158 try_cache_or_fetch(&CACHE, TTL, fetch)
159}
160#[instrument(name = "letterboxd_cached_fetch")]
161pub async fn cached_fetch() -> Option<Media> {
162 cache_or_fetch(&CACHE, TTL, fetch).await
163}