personal activity index (bluesky, leaflet, substack)
pai.desertthunder.dev
rss
bluesky
1use crate::{Item, PaiError, Result, SourceFetcher, SourceKind, Storage, SubstackConfig};
2use chrono::Utc;
3use feed_rs::parser;
4use tokio::runtime::Runtime;
5
6/// Fetcher for Substack RSS feeds
7///
8/// Retrieves posts from a Substack publication by parsing its RSS feed.
9/// Maps RSS items to the standardized Item struct for storage.
10pub struct SubstackFetcher {
11 config: SubstackConfig,
12 client: reqwest::Client,
13}
14
15impl SubstackFetcher {
16 /// Creates a new Substack fetcher with the given configuration
17 pub fn new(config: SubstackConfig) -> Self {
18 Self { config, client: reqwest::Client::new() }
19 }
20
21 /// Fetches and parses the RSS feed
22 async fn fetch_feed(&self) -> Result<feed_rs::model::Feed> {
23 let feed_url = format!("{}/feed", self.config.base_url);
24 let response = self
25 .client
26 .get(&feed_url)
27 .send()
28 .await
29 .map_err(|e| PaiError::Fetch(format!("Failed to fetch RSS feed: {e}")))?;
30
31 let body = response
32 .text()
33 .await
34 .map_err(|e| PaiError::Fetch(format!("Failed to read response body: {e}")))?;
35
36 parser::parse(body.as_bytes()).map_err(|e| PaiError::Parse(format!("Failed to parse RSS feed: {e}")))
37 }
38
39 /// Extracts the source ID from the base URL (e.g., "patternmatched.substack.com")
40 fn extract_source_id(&self) -> String {
41 Self::normalize_source_id(&self.config.base_url)
42 }
43
44 pub(crate) fn normalize_source_id(base_url: &str) -> String {
45 base_url
46 .trim_start_matches("https://")
47 .trim_start_matches("http://")
48 .trim_end_matches('/')
49 .to_string()
50 }
51}
52
53impl SourceFetcher for SubstackFetcher {
54 fn sync(&self, storage: &dyn Storage) -> Result<()> {
55 let runtime = Runtime::new().map_err(|e| PaiError::Fetch(format!("Failed to create runtime: {e}")))?;
56
57 runtime.block_on(async {
58 let feed = self.fetch_feed().await?;
59 let source_id = self.extract_source_id();
60
61 for entry in feed.entries {
62 let id = entry.id.clone();
63 let url = entry
64 .links
65 .first()
66 .map(|link| link.href.clone())
67 .unwrap_or_else(|| id.clone());
68
69 let title = entry.title.as_ref().map(|t| t.content.clone());
70 let summary = entry.summary.as_ref().map(|s| s.content.clone());
71 let author = entry.authors.first().map(|a| a.name.clone());
72 let content_html = entry.content.and_then(|c| c.body);
73
74 let published_at = entry
75 .published
76 .or(entry.updated)
77 .map(|dt| dt.to_rfc3339())
78 .unwrap_or_else(|| Utc::now().to_rfc3339());
79
80 let item = Item {
81 id,
82 source_kind: SourceKind::Substack,
83 source_id: source_id.clone(),
84 author,
85 title,
86 summary,
87 url,
88 content_html,
89 published_at,
90 created_at: Utc::now().to_rfc3339(),
91 };
92
93 storage.insert_or_replace_item(&item)?;
94 }
95
96 Ok(())
97 })
98 }
99}
100
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ListFilter;
    use std::sync::{Arc, Mutex};

    /// RSS 2.0 document containing a single item.
    const SINGLE_ITEM_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
    <title>Test Feed</title>
    <link>https://test.substack.com</link>
    <description>Test</description>
    <item>
        <title>Test Post</title>
        <link>https://test.substack.com/p/test-post</link>
        <guid>test-guid</guid>
        <pubDate>Mon, 01 Jan 2024 12:00:00 +0000</pubDate>
        <description>Test summary</description>
    </item>
</channel>
</rss>"#;

    /// RSS 2.0 document whose channel has no items.
    const EMPTY_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
    <title>Test Feed</title>
</channel>
</rss>"#;

    /// In-memory [`Storage`] double that records every inserted item.
    ///
    /// None of the tests in this module construct it, hence the
    /// `dead_code` allowances.
    #[derive(Clone)]
    #[allow(dead_code)]
    struct MockStorage {
        items: Arc<Mutex<Vec<Item>>>,
    }

    #[allow(dead_code)]
    impl MockStorage {
        /// Creates an empty mock store.
        fn new() -> Self {
            let items = Arc::new(Mutex::new(Vec::new()));
            Self { items }
        }

        /// Returns a copy of everything inserted so far.
        fn get_items(&self) -> Vec<Item> {
            let guard = self.items.lock().unwrap();
            guard.clone()
        }
    }

    impl Storage for MockStorage {
        /// Appends a clone of `item`; nothing is ever replaced.
        fn insert_or_replace_item(&self, item: &Item) -> Result<()> {
            let mut guard = self.items.lock().unwrap();
            guard.push(item.clone());
            Ok(())
        }

        /// Returns every stored item; the filter is ignored.
        fn list_items(&self, _filter: &ListFilter) -> Result<Vec<Item>> {
            let guard = self.items.lock().unwrap();
            Ok(guard.clone())
        }
    }

    #[test]
    fn extract_source_id_https() {
        let source_id = SubstackFetcher::normalize_source_id("https://patternmatched.substack.com");
        assert_eq!(source_id, "patternmatched.substack.com");
    }

    #[test]
    fn extract_source_id_http() {
        let source_id = SubstackFetcher::normalize_source_id("http://test.substack.com/");
        assert_eq!(source_id, "test.substack.com");
    }

    #[test]
    fn parse_valid_rss() {
        let feed = parser::parse(SINGLE_ITEM_FEED.as_bytes()).unwrap();
        assert_eq!(feed.entries.len(), 1);

        let title = feed.entries[0].title.as_ref().unwrap();
        assert_eq!(title.content, "Test Post");
    }

    #[test]
    fn parse_invalid_rss() {
        let outcome = parser::parse("this is not valid XML".as_bytes());
        assert!(outcome.is_err());
    }

    #[test]
    fn parse_empty_rss() {
        let feed = parser::parse(EMPTY_FEED.as_bytes()).unwrap();
        assert!(feed.entries.is_empty());
    }
}