···222222 Tuatara_fetch.detect_feed_type ~content_type:result.content_type result.content)
223223 | None -> Tuatara_fetch.detect_feed_type ~content_type:result.content_type result.content
224224 in
225225- let metadata = Tuatara_fetch.get_feed_metadata ~feed_type:detected_type result.content in
225225+ let metadata = Tuatara_fetch.get_feed_metadata ~feed_type:detected_type ~url result.content in
226226 let title = match name with Some n -> Some n | None -> metadata.title in
227227 let source = Source.make
228228 ~url
···279279 List.iter (fun source ->
280280 let source_name = Option.value (Source.title source) ~default:(Source.url source) in
281281 let source_url = Source.url source in
282282- let feed_type = Source.feed_type source in
283282 Fmt.pr "Fetching %s...@." source_name;
284283 let last_content = ref None in
285284 try
···291290 Fmt.pr " Not modified@."
292291 | Some result ->
293292 last_content := Some result.content;
293293+ (* Re-detect feed type from content - this handles cases where the
294294+ stored type was wrong or the feed format changed *)
295295+ let feed_type =
296296+ Tuatara_fetch.detect_feed_type
297297+ ~content_type:result.content_type
298298+ result.content
299299+ in
294300 let posts = Tuatara_fetch.parse_feed
295301 ~source_id:(Source.id source)
296302 ~feed_type
297303 ~created_at:now
304304+ ~url:source_url
298305 result.content
299306 in
300307 let new_count = ref 0 in
···312319 let source = Source.set_etag source result.etag in
313320 let source = Source.set_last_modified source result.last_modified in
314321 let metadata = Tuatara_fetch.get_feed_metadata
315315- ~feed_type:(Source.feed_type source)
322322+ ~feed_type
323323+ ~url:source_url
316324 result.content
317325 in
318326 let source = match metadata.title with
···333341 match !last_content with
334342 | Some content ->
335343 claude_invoked := true;
344344+ let feed_type = Tuatara_fetch.detect_feed_type ~content_type:None content in
336345 let _ = invoke_claude_fix ~sw ~process_mgr ~clock ~source_url ~feed_type ~error_msg:msg ~content in
337346 ()
338347 | None -> ()
···346355 match !last_content with
347356 | Some content ->
348357 claude_invoked := true;
358358+ let feed_type = Tuatara_fetch.detect_feed_type ~content_type:None content in
349359 let _ = invoke_claude_fix ~sw ~process_mgr ~clock ~source_url ~feed_type ~error_msg:msg ~content in
350360 ()
351361 | None -> ()
+168-34
lib/core/tuatara_fetch.ml
···18181919(* Detect feed type from content *)
(* [detect_feed_type ~content_type content] guesses the format of a fetched
   feed and returns one of [Source.Json], [Source.Rss] or [Source.Atom].

   The payload is inspected first, because Content-Type headers are
   unreliable: aggregators such as FeedBurner may serve RSS documents as
   application/atom+xml. The header is consulted only when the payload is
   empty or starts with something other than an opening brace or angle
   bracket. When neither source is conclusive the result is [Source.Atom].

   [content_type] is the raw header value, if any; it is matched
   case-insensitively by prefix, so parameters such as a charset suffix are
   ignored. *)
let detect_feed_type ~content_type content =
  (* [contains pattern text] is true when the Str [pattern] occurs in [text]. *)
  let contains pattern text =
    match Str.search_forward (Str.regexp pattern) text 0 with
    | (_ : int) -> true
    | exception Not_found -> false
  in
  (* Sniff the payload itself (JSON or XML); [None] when it is inconclusive. *)
  let detect_from_content content =
    let content = String.trim content in
    (* A UTF-8 byte-order mark is not whitespace for [String.trim]; without
       this step a BOM-prefixed feed would never reach the checks below. *)
    let bom = "\xEF\xBB\xBF" in
    let content =
      if String.starts_with ~prefix:bom content then
        String.trim
          (String.sub content (String.length bom)
             (String.length content - String.length bom))
      else content
    in
    if content = "" then None
    else
      match content.[0] with
      | '{' -> Some Source.Json
      | '<' ->
        (* Only the head of the document is examined; the root element is
           expected within the first 500 bytes. *)
        let prefix =
          String.lowercase_ascii
            (String.sub content 0 (min 500 (String.length content)))
        in
        (* RSS is tested first: RSS feeds often declare the Atom namespace
           for self-links, which would otherwise look like Atom. *)
        if contains "<rss" prefix then Some Source.Rss
        else if contains "<feed" prefix then Some Source.Atom
        else if contains "xmlns.*atom" prefix then Some Source.Atom
        else Some Source.Rss
      | _ -> None
  in
  (* Map a Content-Type header value to a feed type by media-type prefix. *)
  let detect_from_header header =
    let header = String.lowercase_ascii header in
    let known_types =
      [ ("application/json", Source.Json);
        ("application/feed+json", Source.Json);
        ("application/atom+xml", Source.Atom);
        ("application/rss+xml", Source.Rss);
        ("application/xml", Source.Rss);
        ("text/xml", Source.Rss) ]
    in
    match
      List.find_opt
        (fun (prefix, _) -> String.starts_with ~prefix header)
        known_types
    with
    | Some (_, feed_type) -> feed_type
    | None -> Source.Atom (* Default fallback *)
  in
  match detect_from_content content with
  | Some feed_type -> feed_type
  | None ->
    (match content_type with
     | Some header -> detect_from_header header
     | None -> Source.Atom (* Default fallback *))
69727073(* Fetch from local file *)
7174let fetch_file ~fs path =
···138141 else
139142 url
140143144144+(* Quirks for specific feeds that have non-standard formats *)
(* Content rewrites applied to raw feed text before it is handed to the
   parser. Each function takes the feed body as a string and returns a
   (possibly) modified copy; none of them raises on malformed input. *)
module Quirks = struct
  (* Three-letter English weekday name for an index where 0 is Sunday.
     Any index other than 0-5 yields Sat. *)
  let weekday_name = function
    | 0 -> "Sun" | 1 -> "Mon" | 2 -> "Tue" | 3 -> "Wed"
    | 4 -> "Thu" | 5 -> "Fri" | _ -> "Sat"

  (* Three-letter English month name for a 1-based month number.
     Any number other than 1-11 yields Dec. *)
  let month_name = function
    | 1 -> "Jan" | 2 -> "Feb" | 3 -> "Mar" | 4 -> "Apr"
    | 5 -> "May" | 6 -> "Jun" | 7 -> "Jul" | 8 -> "Aug"
    | 9 -> "Sep" | 10 -> "Oct" | 11 -> "Nov" | _ -> "Dec"

  (* [matches re text] is true when [re] occurs somewhere in [text]. *)
  let matches re text =
    match Str.search_forward re text 0 with
    | (_ : int) -> true
    | exception Not_found -> false

  (* Translate the zone suffix of an ISO 8601 timestamp (whatever follows
     the seconds field) into an RFC 822 zone. Accepted numeric forms are
     +hh:mm, +hhmm and +hh (and the same with a minus sign). An empty
     suffix, Z, a zero offset, or anything unrecognised yields GMT, which is
     what this module emitted for every input before offsets were
     preserved. *)
  let rfc822_zone suffix =
    let is_digits s =
      s <> "" && String.for_all (fun c -> '0' <= c && c <= '9') s
    in
    let numeric sign rest =
      let parts =
        match String.split_on_char ':' rest with
        | [ hh; mm ] -> Some (hh, mm)
        | [ hhmm ] when String.length hhmm = 4 ->
          Some (String.sub hhmm 0 2, String.sub hhmm 2 2)
        | [ hh ] -> Some (hh, "00")
        | _ -> None
      in
      match parts with
      | Some (hh, mm)
        when String.length hh = 2 && String.length mm = 2
             && is_digits hh && is_digits mm
             && not (hh = "00" && mm = "00") ->
        Printf.sprintf "%c%s%s" sign hh mm
      | Some _ | None -> "GMT"
    in
    let suffix = String.trim suffix in
    if suffix = "" then "GMT"
    else
      match suffix.[0] with
      | ('+' | '-') as sign ->
        numeric sign (String.sub suffix 1 (String.length suffix - 1))
      | _ -> "GMT"

  (* Convert an ISO 8601 timestamp to RFC 822 format.
     digitalflapjack.com uses ISO 8601 dates in RSS pubDate/lastBuildDate
     but the RSS spec requires RFC 822 format.

     Accepts e.g. 2025-10-22T12:24:00Z, 2025-10-22T12:24:00-00:00 or
     2025-10-22T12:24:00+05:30. The wall-clock fields are copied verbatim
     and the UTC offset is carried over as a numeric zone, so the instant
     denoted by the timestamp is preserved. Input that does not have digits
     in the expected positions, or whose fields are out of range, is
     returned unchanged. *)
  let iso8601_to_rfc822 iso =
    (* Read [len] decimal digits starting at [pos]; raises [Failure] on a
       non-digit and [Invalid_argument] when [iso] is too short. *)
    let field pos len =
      let digits = String.sub iso pos len in
      if String.for_all (fun c -> '0' <= c && c <= '9') digits then
        int_of_string digits
      else failwith "iso8601_to_rfc822: expected digits"
    in
    match
      let year = field 0 4 in
      let month = field 5 2 in
      let day = field 8 2 in
      let hour = field 11 2 in
      let minute = field 14 2 in
      let second = field 17 2 in
      (year, month, day, hour, minute, second)
    with
    | exception (Invalid_argument _ | Failure _) -> iso
    | (year, month, day, hour, minute, second)
      when month >= 1 && month <= 12 && day >= 1 && day <= 31
           && hour <= 23 && minute <= 59 && second <= 60 ->
      (* Day of week via Zeller's formula: January and February count as
         months 13 and 14 of the previous year. The formula numbers
         Saturday as 0, hence the shift by 6 to make Sunday 0. *)
      let m = if month < 3 then month + 12 else month in
      let y = if month < 3 then year - 1 else year in
      let zeller =
        (day + (13 * (m + 1)) / 5 + y + y / 4 - y / 100 + y / 400) mod 7
      in
      let wday = weekday_name ((zeller + 6) mod 7) in
      let zone = rfc822_zone (String.sub iso 19 (String.length iso - 19)) in
      Printf.sprintf "%s, %02d %s %04d %02d:%02d:%02d %s"
        wday day (month_name month) year hour minute second zone
    | _ -> iso

  (* digitalflapjack.com uses ISO 8601 dates in RSS which requires RFC 822.
     Rewrites the text of every pubDate / lastBuildDate element that holds
     an ISO 8601 timestamp with an explicit zone (Z or a numeric offset). *)
  let fix_digitalflapjack_dates content =
    (* Group 1 is the tag name; group 2 is the whole timestamp including
       its zone suffix, so the offset reaches [iso8601_to_rfc822]. *)
    let re = Str.regexp "<\\(pubDate\\|lastBuildDate\\)>\\([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9][0-9]:[0-9][0-9]:[0-9][0-9][-+Z][^<]*\\)</\\(pubDate\\|lastBuildDate\\)>" in
    Str.global_substitute re (fun s ->
        let tag = Str.matched_group 1 s in
        let iso_date = Str.matched_group 2 s in
        Printf.sprintf "<%s>%s</%s>" tag (iso8601_to_rfc822 iso_date) tag)
      content

  (* Fix empty subtitle tags in Atom feeds.
     The Syndic library fails with Not_found on empty <subtitle></subtitle>
     tags. This affects feeds like onkargulati.com and other
     Jekyll-generated feeds. The empty element is simply removed. *)
  let fix_empty_subtitle content =
    let re = Str.regexp "<subtitle[^>]*></subtitle>" in
    Str.global_replace re "" content

  (* Fix Atom entries missing <author> elements.
     Per RFC 4287, entries MUST have an author unless the feed has one.
     Syndic enforces this strictly, so a default author is inserted right
     after the opening tag of every entry that lacks one.

     The content is returned untouched when it contains no <feed element,
     or when an <author> appears between <feed and the first entry (i.e.
     the feed has a top-level author). *)
  let fix_missing_entry_authors content =
    let author_re = Str.regexp "<author>" in
    match Str.search_forward (Str.regexp "<feed") content 0 with
    | exception Not_found -> content (* not an Atom feed *)
    | feed_start ->
      let first_entry =
        match Str.search_forward (Str.regexp "<entry") content feed_start with
        | pos -> pos
        | exception Not_found -> String.length content
      in
      let feed_header =
        String.sub content feed_start (first_entry - feed_start)
      in
      if matches author_re feed_header then content
      else begin
        let default_author = "<author><name>Unknown</name></author>" in
        (* Opening tag with or without attributes; the separator after the
           name keeps longer element names from matching. *)
        let entry_open_re = Str.regexp "<entry\\([ \t\r\n][^>]*\\)?>" in
        let entry_close_re = Str.regexp "</entry>" in
        let buf = Buffer.create (String.length content + 1000) in
        (* Copies entries into [buf] one at a time and returns the offset of
           the first byte not yet copied. An opening tag without a closing
           tag ends the walk; the tail is then copied verbatim below. *)
        let rec copy_entries pos =
          match Str.search_forward entry_open_re content pos with
          | exception Not_found -> pos
          | (_ : int) ->
            (* Str keeps match positions in global state: read the end of
               the opening tag before running another search. *)
            let body_start = Str.match_end () in
            (match Str.search_forward entry_close_re content body_start with
             | exception Not_found -> pos
             | close_start ->
               let entry_end = Str.match_end () in
               let body =
                 String.sub content body_start (close_start - body_start)
               in
               (* Everything up to and including the opening tag. *)
               Buffer.add_substring buf content pos (body_start - pos);
               if not (matches author_re body) then begin
                 Buffer.add_string buf "\n ";
                 Buffer.add_string buf default_author
               end;
               (* Entry body plus its closing tag. *)
               Buffer.add_substring buf content body_start
                 (entry_end - body_start);
               copy_entries entry_end)
        in
        let rest = copy_entries 0 in
        Buffer.add_substring buf content rest (String.length content - rest);
        Buffer.contents buf
      end

  (* [apply_for_url url content] runs every applicable quirk over [content]:
     the date rewrite for URLs mentioning digitalflapjack.com, then the
     generic Atom fixes (empty subtitle removal, default entry authors). *)
  let apply_for_url url content =
    let content =
      if matches (Str.regexp "digitalflapjack\\.com") url then
        fix_digitalflapjack_dates content
      else content
    in
    content |> fix_empty_subtitle |> fix_missing_entry_authors
end
272272+141273(* Main fetch function *)
142274let fetch ~sw ~clock ~net ~fs url ?etag ?if_modified_since () =
143275 if is_local_path url then
···258390 ?published ?updated ?author ~tags ~created_at ()
259391260392(* Parse feed content and return posts *)
261261-let parse_feed ~source_id ~feed_type ~created_at content =
393393+let parse_feed ~source_id ~feed_type ~created_at ~url content =
394394+ let content = Quirks.apply_for_url url content in
262395 try
263396 match feed_type with
264397 | Source.Atom ->
···294427 description : string option;
295428}
296429297297-let get_feed_metadata ~feed_type content =
430430+let get_feed_metadata ~feed_type ~url content =
431431+ let content = Quirks.apply_for_url url content in
298432 try
299433 match feed_type with
300434 | Source.Atom ->