use std::{
    sync::{Arc, LazyLock},
    time::Duration,
};

use anyhow::Context;
use reqwest::{Client, Url};
use scraper::{ElementRef, Html, Selector};
use serde::Deserialize;
use tokio::sync::RwLock;
use tracing::instrument;

use super::{
    Media,
    cached::{MediaCache, cache_or_fetch, try_cache_or_fetch},
};

#[derive(Deserialize, Debug, Clone)]
pub struct ImageUrlMetadata {
    url: String,
}

struct Extracted {
    name: String,
    image_url: Url,
    rating: Option<u8>,
    url: String,
}

// CloudFlare's bot detection seems to be more generous towards user agents that don't include
// known HTTP clients, like reqwest or curl.
const USER_AGENT: &str = "myivo/1.0.0";

pub async fn fetch() -> anyhow::Result<Media> {
    let client = Client::builder()
        .user_agent(USER_AGENT)
        .build()
        .context("failed to build client")?;

    let page_url = Url::parse("https://letterboxd.com/ivom/films/diary/")
        .context("wrote invalid Letterboxd URL")?;

    let html = client
        .get(page_url.clone())
        // Including this header seems to contribute to getting past CloudFlare's bot detection.
        .header("priority", "u=0, i")
        .send()
        .await
        .context("failed to fetch Letterboxd page")?
        .text()
        .await
        .context("failed to get HTML text")?;

    let Extracted {
        name,
        image_url,
        rating,
        url,
    } = parse_html(&html)?;

    let image_url_data: ImageUrlMetadata = client
        .get(image_url.clone())
        .send()
        .await
        .with_context(|| format!("failed to fetch image metadata from URL {image_url}"))?
        .json()
        .await
        .context("failed to parse image metadata")?;

    // The `rated-N` class encodes half-star increments, so halve it for display.
    let formatted_rating = match rating {
        Some(rating) => format!(
            "{} {}",
            f32::from(rating) / 2.0,
            if rating == 2 { "star" } else { "stars" }
        ),
        None => "no rating".to_owned(),
    };

    let url = page_url.join(&url).context("film URL was invalid")?;

    Ok(Media {
        name,
        image: image_url_data.url,
        context: formatted_rating,
        url: url.into(),
    })
}

fn parse_html(html: &str) -> anyhow::Result<Extracted> {
    static FIRST_ENTRY_SEL: LazyLock<Selector> =
        LazyLock::new(|| Selector::parse(".diary-entry-row:first-child").unwrap());
    static NAME_SEL: LazyLock<Selector> = LazyLock::new(|| Selector::parse(".name").unwrap());
    static POSTER_COMPONENT_SEL: LazyLock<Selector> =
        LazyLock::new(|| Selector::parse(".react-component:has(> .poster)").unwrap());
    static RATING_SEL: LazyLock<Selector> = LazyLock::new(|| Selector::parse(".rating").unwrap());
    static URL_SEL: LazyLock<Selector> =
        LazyLock::new(|| Selector::parse(".inline-production-masthead .name a").unwrap());

    let document = Html::parse_document(html);

    let first_entry = document
        .select(&FIRST_ENTRY_SEL)
        .next()
        .context("couldn't find any journal entries")?;

    let name = first_entry
        .select(&NAME_SEL)
        .next()
        .context("couldn't find name element")?
        .text()
        .next()
        .context("name element didn't have any text")?
        .to_owned();

    let poster_component = first_entry
        .select(&POSTER_COMPONENT_SEL)
        .next()
        .context("couldn't find poster component")?;

    let rating = first_entry
        .select(&RATING_SEL)
        .next()
        .context("couldn't find rating component")?
        .value()
        .classes()
        .find_map(|class| class.strip_prefix("rated-"))
        .and_then(|rating| rating.parse().ok());

    let url = first_entry
        .select(&URL_SEL)
        .next()
        .context("couldn't find film URL element")?
        .attr("href")
        .context("film URL element didn't have a URL")?
        .to_owned();

    let image_url = build_image_url(poster_component)?;

    Ok(Extracted {
        name,
        image_url,
        rating,
        url,
    })
}

fn build_image_url(poster_component: ElementRef) -> anyhow::Result<Url> {
    let film_path = poster_component
        .attr("data-item-link")
        .context("poster component didn't have an image URL path")?;
    let cache_key = poster_component.attr("data-cache-busting-key");

    let image_size = 230;
    let image_url = format!("https://letterboxd.com{film_path}/poster/std/{image_size}/");
    let mut image_url =
        Url::parse(&image_url).with_context(|| format!("failed to parse URL {image_url}"))?;

    if let Some(cache_key) = cache_key {
        image_url.query_pairs_mut().append_pair("k", cache_key);
    }

    Ok(image_url)
}

static CACHE: LazyLock<MediaCache> = LazyLock::new(|| Arc::new(RwLock::new(None)));
static TTL: Duration = Duration::from_secs(1800);

#[instrument(name = "letterboxd_try_cached_fetch")]
pub fn try_cached_fetch() -> Option<Media> {
    try_cache_or_fetch(&CACHE, TTL, fetch)
}

#[instrument(name = "letterboxd_cached_fetch")]
pub async fn cached_fetch() -> Option<Media> {
    cache_or_fetch(&CACHE, TTL, fetch).await
}
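
// A minimal test sketch for `parse_html`, using a hypothetical simplified fixture
// shaped to satisfy the selectors above (including the `:has()` selector); the real
// Letterboxd diary markup is more involved, so this is illustrative, not authoritative.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_minimal_diary_entry() {
        // Fixture values ("Example Film", "/film/example-film", "abc123") are made up.
        let html = r#"
            <div class="diary">
              <div class="diary-entry-row">
                <div class="inline-production-masthead">
                  <h2 class="name"><a href="/film/example-film/">Example Film</a></h2>
                </div>
                <div class="react-component"
                     data-item-link="/film/example-film"
                     data-cache-busting-key="abc123">
                  <div class="poster"></div>
                </div>
                <span class="rating rated-8"></span>
              </div>
            </div>
        "#;

        let extracted = parse_html(html).expect("fixture should parse");

        assert_eq!(extracted.name, "Example Film");
        assert_eq!(extracted.rating, Some(8));
        assert_eq!(extracted.url, "/film/example-film/");
        assert_eq!(
            extracted.image_url.as_str(),
            "https://letterboxd.com/film/example-film/poster/std/230/?k=abc123"
        );
    }
}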