src/services/feed-discovery.ts at main

gui.do / sifa-api
fork atom
Sifa professional network API (Fastify, AT Protocol, Jetstream) sifa.id/
fork atom
sifa-api / src / services / feed-discovery.ts
at main 200 lines 5.5 kB view raw
wrap content
Guido X Jansen fix(security): address CodeQL alerts for URL sanitization and workflow permissions 5d ago
19198bf4
  1import { logger } from '../logger.js';
  2
  3const FETCH_TIMEOUT = 10000;
  4
  5export async function discoverFeedUrl(platform: string, url: string): Promise<string | null> {
  6  try {
  7    if (platform === 'youtube') {
  8      return discoverYoutubeFeed(url);
  9    }
 10    if (platform === 'fediverse') {
 11      return discoverFediverseFeed(url);
 12    }
 13    if (platform === 'rss') {
 14      return url;
 15    }
 16    if (platform === 'website') {
 17      return discoverRssFeed(url);
 18    }
 19    return null;
 20  } catch (err) {
 21    logger.warn({ err, platform, url }, 'Feed discovery failed');
 22    return null;
 23  }
 24}
 25
 26async function discoverYoutubeFeed(url: string): Promise<string | null> {
 27  try {
 28    const parsed = new URL(url);
 29    if (
 30      !parsed.hostname.endsWith('.youtube.com') &&
 31      parsed.hostname !== 'youtube.com' &&
 32      !parsed.hostname.endsWith('.youtu.be') &&
 33      parsed.hostname !== 'youtu.be'
 34    ) {
 35      return null;
 36    }
 37
 38    // Direct channel ID URL: /channel/UC...
 39    const channelMatch = parsed.pathname.match(/\/channel\/(UC[\w-]+)/);
 40    if (channelMatch?.[1]) {
 41      return `https://www.youtube.com/feeds/videos.xml?channel_id=${channelMatch[1]}`;
 42    }
 43
 44    // For /@handle or /c/name URLs, fetch the page to extract the channel ID
 45    const response = await fetch(url, {
 46      signal: AbortSignal.timeout(FETCH_TIMEOUT),
 47      headers: { 'User-Agent': 'Sifa/1.0 (+https://sifa.id)' },
 48    });
 49    if (!response.ok) return null;
 50
 51    const html = await response.text();
 52    const idMatch = html.match(/channel_id=([A-Za-z0-9_-]+)/);
 53    if (idMatch?.[1]) {
 54      return `https://www.youtube.com/feeds/videos.xml?channel_id=${idMatch[1]}`;
 55    }
 56
 57    return null;
 58  } catch {
 59    return null;
 60  }
 61}
 62
 63function discoverFediverseFeed(url: string): string | null {
 64  try {
 65    const parsed = new URL(url);
 66    const pathParts = parsed.pathname.split('/').filter(Boolean);
 67    const username = pathParts.find((p) => p.startsWith('@'));
 68    if (username) {
 69      return `${parsed.origin}/${username}.rss`;
 70    }
 71    return null;
 72  } catch {
 73    return null;
 74  }
 75}
 76
 77async function discoverRssFeed(url: string): Promise<string | null> {
 78  try {
 79    const response = await fetch(url, {
 80      signal: AbortSignal.timeout(FETCH_TIMEOUT),
 81      headers: { 'User-Agent': 'Sifa/1.0 (+https://sifa.id)' },
 82    });
 83
 84    if (!response.ok) return null;
 85
 86    const contentType = response.headers.get('content-type') ?? '';
 87    if (
 88      contentType.includes('xml') ||
 89      contentType.includes('rss') ||
 90      contentType.includes('atom')
 91    ) {
 92      return url;
 93    }
 94
 95    const html = await response.text();
 96    const linkMatch = html.match(/<link[^>]+type=["']application\/(rss|atom)\+xml["'][^>]*>/i);
 97    if (!linkMatch) return null;
 98
 99    const hrefMatch = linkMatch[0].match(/href=["']([^"']+)["']/i);
100    if (!hrefMatch) return null;
101
102    const feedHref = hrefMatch[1];
103    if (!feedHref) return null;
104
105    try {
106      return new URL(feedHref, url).toString();
107    } catch {
108      return null;
109    }
110  } catch {
111    return null;
112  }
113}
114
/** A single feed entry, normalised from an RSS `<item>` or Atom `<entry>` element. */
export interface FeedItem {
  /** Entry title with CDATA wrappers removed; empty when the entry has no title. */
  title: string;
  /** Description with markup tags stripped, cut to 200 characters plus `...` when longer. */
  excerpt: string;
  /** Link of the entry; empty string when none was found. */
  url: string;
  /** Published (or updated) date as an ISO 8601 string; empty when the entry carries no date. */
  timestamp: string;
  /** Caller-supplied label, passed through unchanged from `fetchFeedItems`. */
  source: string;
}
122
/**
 * Downloads a feed and converts it to `FeedItem`s.
 *
 * @param feedUrl - URL of the RSS/Atom document.
 * @param source - Label copied onto every returned item.
 * @returns Parsed items; an empty array on a non-2xx response or on any
 *   error, which is logged at warn level rather than thrown.
 */
export async function fetchFeedItems(feedUrl: string, source: string): Promise<FeedItem[]> {
  try {
    const reply = await fetch(feedUrl, {
      headers: { 'User-Agent': 'Sifa/1.0 (+https://sifa.id)' },
      signal: AbortSignal.timeout(FETCH_TIMEOUT),
    });

    return reply.ok ? parseRssFeed(await reply.text(), source) : [];
  } catch (err) {
    logger.warn({ err, feedUrl }, 'Failed to fetch feed');
    return [];
  }
}
139
/**
 * Extracts up to 20 entries from an RSS or Atom document using regex matching
 * (no XML parser is involved).
 *
 * RSS `<item>` elements are preferred; Atom `<entry>` elements are used only
 * when no `<item>` is present. Entries with neither a title nor a link are
 * skipped.
 *
 * @param xml - Raw feed document.
 * @param source - Label copied onto every returned item.
 * @returns Items in document order. A missing or unparsable date yields an
 *   empty `timestamp` instead of failing the whole feed.
 */
function parseRssFeed(xml: string, source: string): FeedItem[] {
  const entries =
    xml.match(/<item[\s>][\s\S]*?<\/item>/gi) ?? xml.match(/<entry[\s>][\s\S]*?<\/entry>/gi) ?? [];

  const items: FeedItem[] = [];

  for (const itemXml of entries.slice(0, 20)) {
    const title = extractTag(itemXml, 'title') ?? '';
    const link = extractLink(itemXml);
    if (!title && !link) continue;

    const description =
      extractTag(itemXml, 'description') ??
      extractTag(itemXml, 'summary') ??
      extractTag(itemXml, 'content') ??
      '';
    const pubDate =
      extractTag(itemXml, 'pubDate') ??
      extractTag(itemXml, 'published') ??
      extractTag(itemXml, 'updated') ??
      '';

    const plainDesc = stripTags(description).trim();

    items.push({
      title: title.replace(/<!\[CDATA\[(.*?)\]\]>/g, '$1').trim(),
      excerpt: plainDesc.length > 200 ? plainDesc.slice(0, 200) + '...' : plainDesc,
      url: link ?? '',
      timestamp: toIsoTimestamp(pubDate),
      source,
    });
  }

  return items;
}

/** Removes markup tags, repeating until stable so nested fragments like `<<b>script>` cannot survive. */
function stripTags(markup: string): string {
  let current = markup;
  let previous = '';
  while (previous !== current) {
    previous = current;
    current = current.replace(/<[^>]+>/g, '');
  }
  return current;
}

/** Converts a feed date string to ISO 8601, or `''` when it is empty or not a parsable date. */
function toIsoTimestamp(raw: string): string {
  if (!raw) return '';
  // `toISOString()` throws RangeError on an invalid Date, so validate first.
  const millis = Date.parse(raw);
  return Number.isNaN(millis) ? '' : new Date(millis).toISOString();
}
182
/**
 * Returns the text content of the first `<tag>...</tag>` element in `xml`,
 * with CDATA wrappers removed and surrounding whitespace trimmed.
 *
 * Matching is case-insensitive and regex-based. The tag name must match
 * exactly: `title` does not match `<titles>`, and `content` does not match
 * `<content:encoded>`.
 *
 * @param xml - XML fragment to search.
 * @param tag - Element name, matched literally.
 * @returns The element's content, or `null` when the element is absent or empty.
 */
function extractTag(xml: string, tag: string): string | null {
  // Treat the name as literal text so characters like "." or "+" cannot alter the pattern.
  const name = tag.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // After the name allow only ">" or whitespace + attributes; a bare `[^>]*`
  // would accept any longer tag name that merely starts with `tag`.
  const pattern = new RegExp(`<${name}(?:\\s[^>]*)?>([\\s\\S]*?)<\\/${name}\\s*>`, 'i');

  const content = pattern.exec(xml)?.[1];
  if (!content) return null;

  return content.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1').trim();
}
190
/**
 * Finds the URL of a feed entry.
 *
 * Atom entries may carry several `<link href="...">` elements (`self`,
 * `replies`, `enclosure`, ...). The entry's own page is the one with
 * `rel="alternate"` or no `rel` at all, so that one is preferred; if none
 * qualifies, the first `href` found is used. RSS-style `<link>text</link>`
 * is the final fallback.
 *
 * @param xml - XML of a single `<item>` or `<entry>`.
 * @returns The link URL, or `null` when the entry has none.
 */
function extractLink(xml: string): string | null {
  const linkTags = xml.match(/<link\s[^>]*>/gi) ?? [];

  let firstHref: string | null = null;
  for (const linkTag of linkTags) {
    const href = /\shref=["']([^"']+)["']/i.exec(linkTag)?.[1];
    if (!href) continue;

    const rel = /\srel=["']([^"']*)["']/i.exec(linkTag)?.[1]?.toLowerCase();
    if (rel === undefined || rel === 'alternate') return href;

    // Remember the first href of any kind in case no alternate link exists.
    if (firstHref === null) firstHref = href;
  }
  if (firstHref !== null) return firstHref;

  return extractTag(xml, 'link');
}