My personal site cherry.computer
htmx tailwind axum askama
at main 163 lines 5.1 kB view raw
1use std::{ 2 sync::{Arc, LazyLock}, 3 time::Duration, 4}; 5 6use anyhow::Context; 7use reqwest::{Client, Url}; 8use scraper::{ElementRef, Html, Selector}; 9use serde::Deserialize; 10use tokio::sync::RwLock; 11use tracing::instrument; 12 13use super::{ 14 Media, 15 cached::{MediaCache, cache_or_fetch, try_cache_or_fetch}, 16}; 17 18#[derive(Deserialize, Debug, Clone)] 19pub struct ImageUrlMetadata { 20 url: String, 21} 22 23struct Extracted { 24 name: String, 25 image_url: Url, 26 rating: Option<u8>, 27 url: String, 28} 29 30// CloudFlare's bot detection seems to be more generous towards user agents that don't include 31// known HTTP clients, like reqwest or curl. 32const USER_AGENT: &str = "myivo/1.0.0"; 33 34pub async fn fetch() -> anyhow::Result<Media> { 35 let client = Client::builder() 36 .user_agent(USER_AGENT) 37 .build() 38 .context("failed to build client")?; 39 let page_url = Url::parse("https://letterboxd.com/ivom/films/diary/") 40 .context("wrote invalid Letterboxd URL")?; 41 let html = client 42 .get(page_url.clone()) 43 // including this header seems to contribute to getting past CloudFlare's bot detection. 44 .header("priority", "u=0, i") 45 .send() 46 .await 47 .context("failed to fetch Letterboxd page")? 48 .text() 49 .await 50 .context("failed to get HTML text")?; 51 let Extracted { 52 name, 53 image_url, 54 rating, 55 url, 56 } = parse_html(&html)?; 57 58 let image_url_data: ImageUrlMetadata = client 59 .get(image_url.clone()) 60 .send() 61 .await 62 .with_context(|| format!("failed to fetch image metadata from URL {image_url}"))? 63 .json() 64 .await 65 .context("failed to parse image metadata")?; 66 let formatted_rating = match rating { 67 Some(rating) => format!( 68 "{} {}", 69 f32::from(rating) / 2.0, 70 if rating == 2 { "star" } else { "stars" } 71 ), 72 None => "no rating".to_owned(), 73 }; 74 let url = page_url.join(&url).context("film URL was invalid")?; 75 76 Ok(Media { 77 name, 78 image: image_url_data.url, 79 context: formatted_rating, 80 url: url.into(), 81 }) 82} 83 84fn parse_html(html: &str) -> anyhow::Result<Extracted> { 85 static FIRST_ENTRY_SEL: LazyLock<Selector> = 86 LazyLock::new(|| Selector::parse(".diary-entry-row:first-child").unwrap()); 87 static NAME_SEL: LazyLock<Selector> = LazyLock::new(|| Selector::parse(".name").unwrap()); 88 static POSTER_COMPONENT_SEL: LazyLock<Selector> = 89 LazyLock::new(|| Selector::parse(".react-component:has(> .poster)").unwrap()); 90 static RATING_SEL: LazyLock<Selector> = LazyLock::new(|| Selector::parse(".rating").unwrap()); 91 static URL_SEL: LazyLock<Selector> = 92 LazyLock::new(|| Selector::parse(".inline-production-masthead .name a").unwrap()); 93 94 let document = Html::parse_document(html); 95 96 let first_entry = document 97 .select(&FIRST_ENTRY_SEL) 98 .next() 99 .context("couldn't find any journal entries")?; 100 let name = first_entry 101 .select(&NAME_SEL) 102 .next() 103 .context("couldn't find name element")? 104 .text() 105 .next() 106 .context("name element didn't have any text")? 107 .to_owned(); 108 let poster_component = first_entry 109 .select(&POSTER_COMPONENT_SEL) 110 .next() 111 .context("couldn't find post component")?; 112 let rating = first_entry 113 .select(&RATING_SEL) 114 .next() 115 .context("couldn't find rating component")? 116 .value() 117 .classes() 118 .find_map(|class| class.strip_prefix("rated-")) 119 .and_then(|rating| rating.parse().ok()); 120 let url = first_entry 121 .select(&URL_SEL) 122 .next() 123 .context("couldn't find film URL element")? 124 .attr("href") 125 .context("film URL element didn't have a URL")? 126 .to_owned(); 127 128 let image_url = build_image_url(poster_component)?; 129 130 Ok(Extracted { 131 name, 132 image_url, 133 rating, 134 url, 135 }) 136} 137 138fn build_image_url(poster_component: ElementRef) -> anyhow::Result<Url> { 139 let film_path = poster_component 140 .attr("data-item-link") 141 .context("poster component didn't have an image URL path")?; 142 let cache_key = poster_component.attr("data-cache-busting-key"); 143 let image_size = 230; 144 let image_url = format!("https://letterboxd.com{film_path}/poster/std/{image_size}/",); 145 let mut image_url = 146 Url::parse(&image_url).with_context(|| format!("failed to parse URL {image_url}"))?; 147 if let Some(cache_key) = cache_key { 148 image_url.query_pairs_mut().append_pair("k", cache_key); 149 } 150 151 Ok(image_url) 152} 153 154static CACHE: LazyLock<MediaCache> = LazyLock::new(|| Arc::new(RwLock::new(None))); 155static TTL: Duration = Duration::from_secs(1800); 156#[instrument(name = "letterboxd_try_cached_fetch")] 157pub fn try_cached_fetch() -> Option<Media> { 158 try_cache_or_fetch(&CACHE, TTL, fetch) 159} 160#[instrument(name = "letterboxd_cached_fetch")] 161pub async fn cached_fetch() -> Option<Media> { 162 cache_or_fetch(&CACHE, TTL, fetch).await 163}