personal activity index (bluesky, leaflet, substack) pai.desertthunder.dev
rss bluesky
at main 193 lines 5.8 kB view raw
1use crate::{Item, PaiError, Result, SourceFetcher, SourceKind, Storage, SubstackConfig}; 2use chrono::Utc; 3use feed_rs::parser; 4use tokio::runtime::Runtime; 5 6/// Fetcher for Substack RSS feeds 7/// 8/// Retrieves posts from a Substack publication by parsing its RSS feed. 9/// Maps RSS items to the standardized Item struct for storage. 10pub struct SubstackFetcher { 11 config: SubstackConfig, 12 client: reqwest::Client, 13} 14 15impl SubstackFetcher { 16 /// Creates a new Substack fetcher with the given configuration 17 pub fn new(config: SubstackConfig) -> Self { 18 Self { config, client: reqwest::Client::new() } 19 } 20 21 /// Fetches and parses the RSS feed 22 async fn fetch_feed(&self) -> Result<feed_rs::model::Feed> { 23 let feed_url = format!("{}/feed", self.config.base_url); 24 let response = self 25 .client 26 .get(&feed_url) 27 .send() 28 .await 29 .map_err(|e| PaiError::Fetch(format!("Failed to fetch RSS feed: {e}")))?; 30 31 let body = response 32 .text() 33 .await 34 .map_err(|e| PaiError::Fetch(format!("Failed to read response body: {e}")))?; 35 36 parser::parse(body.as_bytes()).map_err(|e| PaiError::Parse(format!("Failed to parse RSS feed: {e}"))) 37 } 38 39 /// Extracts the source ID from the base URL (e.g., "patternmatched.substack.com") 40 fn extract_source_id(&self) -> String { 41 Self::normalize_source_id(&self.config.base_url) 42 } 43 44 pub(crate) fn normalize_source_id(base_url: &str) -> String { 45 base_url 46 .trim_start_matches("https://") 47 .trim_start_matches("http://") 48 .trim_end_matches('/') 49 .to_string() 50 } 51} 52 53impl SourceFetcher for SubstackFetcher { 54 fn sync(&self, storage: &dyn Storage) -> Result<()> { 55 let runtime = Runtime::new().map_err(|e| PaiError::Fetch(format!("Failed to create runtime: {e}")))?; 56 57 runtime.block_on(async { 58 let feed = self.fetch_feed().await?; 59 let source_id = self.extract_source_id(); 60 61 for entry in feed.entries { 62 let id = entry.id.clone(); 63 let url = entry 64 .links 65 .first() 66 .map(|link| link.href.clone()) 67 .unwrap_or_else(|| id.clone()); 68 69 let title = entry.title.as_ref().map(|t| t.content.clone()); 70 let summary = entry.summary.as_ref().map(|s| s.content.clone()); 71 let author = entry.authors.first().map(|a| a.name.clone()); 72 let content_html = entry.content.and_then(|c| c.body); 73 74 let published_at = entry 75 .published 76 .or(entry.updated) 77 .map(|dt| dt.to_rfc3339()) 78 .unwrap_or_else(|| Utc::now().to_rfc3339()); 79 80 let item = Item { 81 id, 82 source_kind: SourceKind::Substack, 83 source_id: source_id.clone(), 84 author, 85 title, 86 summary, 87 url, 88 content_html, 89 published_at, 90 created_at: Utc::now().to_rfc3339(), 91 }; 92 93 storage.insert_or_replace_item(&item)?; 94 } 95 96 Ok(()) 97 }) 98 } 99} 100 101#[cfg(test)] 102mod tests { 103 use super::*; 104 use crate::ListFilter; 105 use std::sync::{Arc, Mutex}; 106 107 #[derive(Clone)] 108 #[allow(dead_code)] 109 struct MockStorage { 110 items: Arc<Mutex<Vec<Item>>>, 111 } 112 113 #[allow(dead_code)] 114 impl MockStorage { 115 fn new() -> Self { 116 Self { items: Arc::new(Mutex::new(Vec::new())) } 117 } 118 119 fn get_items(&self) -> Vec<Item> { 120 self.items.lock().unwrap().clone() 121 } 122 } 123 124 impl Storage for MockStorage { 125 fn insert_or_replace_item(&self, item: &Item) -> Result<()> { 126 self.items.lock().unwrap().push(item.clone()); 127 Ok(()) 128 } 129 130 fn list_items(&self, _filter: &ListFilter) -> Result<Vec<Item>> { 131 Ok(self.items.lock().unwrap().clone()) 132 } 133 } 134 135 #[test] 136 fn extract_source_id_https() { 137 assert_eq!( 138 SubstackFetcher::normalize_source_id("https://patternmatched.substack.com"), 139 "patternmatched.substack.com" 140 ); 141 } 142 143 #[test] 144 fn extract_source_id_http() { 145 assert_eq!( 146 SubstackFetcher::normalize_source_id("http://test.substack.com/"), 147 "test.substack.com" 148 ); 149 } 150 151 #[test] 152 fn parse_valid_rss() { 153 let rss = r#"<?xml version="1.0" encoding="UTF-8"?> 154<rss version="2.0"> 155<channel> 156 <title>Test Feed</title> 157 <link>https://test.substack.com</link> 158 <description>Test</description> 159 <item> 160 <title>Test Post</title> 161 <link>https://test.substack.com/p/test-post</link> 162 <guid>test-guid</guid> 163 <pubDate>Mon, 01 Jan 2024 12:00:00 +0000</pubDate> 164 <description>Test summary</description> 165 </item> 166</channel> 167</rss>"#; 168 169 let feed = parser::parse(rss.as_bytes()).unwrap(); 170 assert_eq!(feed.entries.len(), 1); 171 assert_eq!(feed.entries[0].title.as_ref().unwrap().content, "Test Post"); 172 } 173 174 #[test] 175 fn parse_invalid_rss() { 176 let invalid_rss = "this is not valid XML"; 177 let result = parser::parse(invalid_rss.as_bytes()); 178 assert!(result.is_err()); 179 } 180 181 #[test] 182 fn parse_empty_rss() { 183 let rss = r#"<?xml version="1.0" encoding="UTF-8"?> 184<rss version="2.0"> 185<channel> 186 <title>Test Feed</title> 187</channel> 188</rss>"#; 189 190 let feed = parser::parse(rss.as_bytes()).unwrap(); 191 assert_eq!(feed.entries.len(), 0); 192 } 193}