core/src/fetchers/substack.rs at main · desertthunder.dev/pai

desertthunder.dev / pai
personal activity index (bluesky, leaflet, substack) pai.desertthunder.dev
rss bluesky
pai / core / src / fetchers / substack.rs
at main 193 lines 5.8 kB view raw
  1use crate::{Item, PaiError, Result, SourceFetcher, SourceKind, Storage, SubstackConfig};
  2use chrono::Utc;
  3use feed_rs::parser;
  4use tokio::runtime::Runtime;
  5
/// Fetcher for Substack RSS feeds
///
/// Retrieves posts from a Substack publication by parsing its RSS feed.
/// Maps RSS items to the standardized Item struct for storage.
pub struct SubstackFetcher {
    /// Publication settings; `base_url` is used to build the feed URL and the source ID.
    config: SubstackConfig,
    /// HTTP client used to request the feed.
    client: reqwest::Client,
}
 14
 15impl SubstackFetcher {
 16    /// Creates a new Substack fetcher with the given configuration
 17    pub fn new(config: SubstackConfig) -> Self {
 18        Self { config, client: reqwest::Client::new() }
 19    }
 20
 21    /// Fetches and parses the RSS feed
 22    async fn fetch_feed(&self) -> Result<feed_rs::model::Feed> {
 23        let feed_url = format!("{}/feed", self.config.base_url);
 24        let response = self
 25            .client
 26            .get(&feed_url)
 27            .send()
 28            .await
 29            .map_err(|e| PaiError::Fetch(format!("Failed to fetch RSS feed: {e}")))?;
 30
 31        let body = response
 32            .text()
 33            .await
 34            .map_err(|e| PaiError::Fetch(format!("Failed to read response body: {e}")))?;
 35
 36        parser::parse(body.as_bytes()).map_err(|e| PaiError::Parse(format!("Failed to parse RSS feed: {e}")))
 37    }
 38
 39    /// Extracts the source ID from the base URL (e.g., "patternmatched.substack.com")
 40    fn extract_source_id(&self) -> String {
 41        Self::normalize_source_id(&self.config.base_url)
 42    }
 43
 44    pub(crate) fn normalize_source_id(base_url: &str) -> String {
 45        base_url
 46            .trim_start_matches("https://")
 47            .trim_start_matches("http://")
 48            .trim_end_matches('/')
 49            .to_string()
 50    }
 51}
 52
 53impl SourceFetcher for SubstackFetcher {
 54    fn sync(&self, storage: &dyn Storage) -> Result<()> {
 55        let runtime = Runtime::new().map_err(|e| PaiError::Fetch(format!("Failed to create runtime: {e}")))?;
 56
 57        runtime.block_on(async {
 58            let feed = self.fetch_feed().await?;
 59            let source_id = self.extract_source_id();
 60
 61            for entry in feed.entries {
 62                let id = entry.id.clone();
 63                let url = entry
 64                    .links
 65                    .first()
 66                    .map(|link| link.href.clone())
 67                    .unwrap_or_else(|| id.clone());
 68
 69                let title = entry.title.as_ref().map(|t| t.content.clone());
 70                let summary = entry.summary.as_ref().map(|s| s.content.clone());
 71                let author = entry.authors.first().map(|a| a.name.clone());
 72                let content_html = entry.content.and_then(|c| c.body);
 73
 74                let published_at = entry
 75                    .published
 76                    .or(entry.updated)
 77                    .map(|dt| dt.to_rfc3339())
 78                    .unwrap_or_else(|| Utc::now().to_rfc3339());
 79
 80                let item = Item {
 81                    id,
 82                    source_kind: SourceKind::Substack,
 83                    source_id: source_id.clone(),
 84                    author,
 85                    title,
 86                    summary,
 87                    url,
 88                    content_html,
 89                    published_at,
 90                    created_at: Utc::now().to_rfc3339(),
 91                };
 92
 93                storage.insert_or_replace_item(&item)?;
 94            }
 95
 96            Ok(())
 97        })
 98    }
 99}
100
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ListFilter;
    use std::sync::{Arc, Mutex};

    /// In-memory [`Storage`] stand-in that records every item handed to it.
    #[derive(Clone)]
    #[allow(dead_code)]
    struct MockStorage {
        items: Arc<Mutex<Vec<Item>>>,
    }

    #[allow(dead_code)]
    impl MockStorage {
        /// Builds a mock with no recorded items.
        fn new() -> Self {
            let items = Arc::new(Mutex::new(Vec::new()));
            Self { items }
        }

        /// Returns a copy of everything recorded so far.
        fn get_items(&self) -> Vec<Item> {
            let recorded = self.items.lock().unwrap();
            recorded.clone()
        }
    }

    impl Storage for MockStorage {
        /// Appends a clone of `item`; existing entries are never replaced.
        fn insert_or_replace_item(&self, item: &Item) -> Result<()> {
            let mut recorded = self.items.lock().unwrap();
            recorded.push(item.clone());
            Ok(())
        }

        /// Returns every recorded item; the filter is ignored.
        fn list_items(&self, _filter: &ListFilter) -> Result<Vec<Item>> {
            Ok(self.get_items())
        }
    }

    /// An `https://` prefix is removed from the source ID.
    #[test]
    fn extract_source_id_https() {
        let normalized = SubstackFetcher::normalize_source_id("https://patternmatched.substack.com");
        assert_eq!(normalized, "patternmatched.substack.com");
    }

    /// An `http://` prefix and a trailing slash are both removed.
    #[test]
    fn extract_source_id_http() {
        let normalized = SubstackFetcher::normalize_source_id("http://test.substack.com/");
        assert_eq!(normalized, "test.substack.com");
    }

    /// A well-formed single-item RSS document yields one entry with its title.
    #[test]
    fn parse_valid_rss() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
    <title>Test Feed</title>
    <link>https://test.substack.com</link>
    <description>Test</description>
    <item>
        <title>Test Post</title>
        <link>https://test.substack.com/p/test-post</link>
        <guid>test-guid</guid>
        <pubDate>Mon, 01 Jan 2024 12:00:00 +0000</pubDate>
        <description>Test summary</description>
    </item>
</channel>
</rss>"#;

        let entries = parser::parse(xml.as_bytes()).unwrap().entries;
        assert_eq!(entries.len(), 1);

        let first_title = entries[0].title.as_ref().unwrap();
        assert_eq!(first_title.content, "Test Post");
    }

    /// Input that is not XML is rejected by the parser.
    #[test]
    fn parse_invalid_rss() {
        let outcome = parser::parse("this is not valid XML".as_bytes());
        assert!(outcome.is_err());
    }

    /// A channel without items parses to a feed with no entries.
    #[test]
    fn parse_empty_rss() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
    <title>Test Feed</title>
</channel>
</rss>"#;

        let entries = parser::parse(xml.as_bytes()).unwrap().entries;
        assert_eq!(entries.len(), 0);
    }
}