forced evolution · anil.recoil.org/tuatara@a56c4a8

+13 -3

lib/core/tuatara_cmd.ml

··· 222 222 Tuatara_fetch.detect_feed_type ~content_type:result.content_type result.content) 223 223 | None -> Tuatara_fetch.detect_feed_type ~content_type:result.content_type result.content 224 224 in 225 - let metadata = Tuatara_fetch.get_feed_metadata ~feed_type:detected_type result.content in 225 + let metadata = Tuatara_fetch.get_feed_metadata ~feed_type:detected_type ~url result.content in 226 226 let title = match name with Some n -> Some n | None -> metadata.title in 227 227 let source = Source.make 228 228 ~url ··· 279 279 List.iter (fun source -> 280 280 let source_name = Option.value (Source.title source) ~default:(Source.url source) in 281 281 let source_url = Source.url source in 282 - let feed_type = Source.feed_type source in 283 282 Fmt.pr "Fetching %s...@." source_name; 284 283 let last_content = ref None in 285 284 try ··· 291 290 Fmt.pr " Not modified@." 292 291 | Some result -> 293 292 last_content := Some result.content; 293 + (* Re-detect feed type from content - this handles cases where the 294 + stored type was wrong or the feed format changed *) 295 + let feed_type = 296 + Tuatara_fetch.detect_feed_type 297 + ~content_type:result.content_type 298 + result.content 299 + in 294 300 let posts = Tuatara_fetch.parse_feed 295 301 ~source_id:(Source.id source) 296 302 ~feed_type 297 303 ~created_at:now 304 + ~url:source_url 298 305 result.content 299 306 in 300 307 let new_count = ref 0 in ··· 312 319 let source = Source.set_etag source result.etag in 313 320 let source = Source.set_last_modified source result.last_modified in 314 321 let metadata = Tuatara_fetch.get_feed_metadata 315 - ~feed_type:(Source.feed_type source) 322 + ~feed_type 323 + ~url:source_url 316 324 result.content 317 325 in 318 326 let source = match metadata.title with ··· 333 341 match !last_content with 334 342 | Some content -> 335 343 claude_invoked := true; 344 + let feed_type = Tuatara_fetch.detect_feed_type ~content_type:None content in 336 345 let _ = invoke_claude_fix ~sw ~process_mgr ~clock ~source_url ~feed_type ~error_msg:msg ~content in 337 346 () 338 347 | None -> () ··· 346 355 match !last_content with 347 356 | Some content -> 348 357 claude_invoked := true; 358 + let feed_type = Tuatara_fetch.detect_feed_type ~content_type:None content in 349 359 let _ = invoke_claude_fix ~sw ~process_mgr ~clock ~source_url ~feed_type ~error_msg:msg ~content in 350 360 () 351 361 | None -> ()

+168 -34

lib/core/tuatara_fetch.ml

··· 18 18 19 19 (* Detect feed type from content *) 20 20 let detect_feed_type ~content_type content = 21 - (* First try content-type header *) 22 - let from_header = 23 - match content_type with 24 - | Some ct -> 25 - let ct = String.lowercase_ascii ct in 26 - if String.length ct >= 16 && String.sub ct 0 16 = "application/json" then 27 - Some Source.Json 28 - else if String.length ct >= 21 && String.sub ct 0 21 = "application/feed+json" then 29 - Some Source.Json 30 - else if String.length ct >= 20 && String.sub ct 0 20 = "application/atom+xml" then 31 - Some Source.Atom 32 - else if String.length ct >= 19 && String.sub ct 0 19 = "application/rss+xml" then 33 - Some Source.Rss 34 - else if String.length ct >= 15 && String.sub ct 0 15 = "application/xml" then 35 - Some Source.Rss 36 - else if String.length ct >= 8 && String.sub ct 0 8 = "text/xml" then 37 - Some Source.Rss 38 - else 39 - None 40 - | None -> None 41 - in 42 - (* Refine by looking at content *) 43 - match from_header with 44 - | Some ft -> ft 45 - | None -> 21 + (* Helper to detect feed type from XML content *) 22 + let detect_from_content content = 46 23 let content = String.trim content in 47 24 if String.length content > 0 then 48 25 if content.[0] = '{' then 49 - Source.Json 26 + Some Source.Json 50 27 else if content.[0] = '<' then 51 28 (* XML - check for rss vs atom *) 52 29 let prefix = String.lowercase_ascii (String.sub content 0 (min 500 (String.length content))) in 53 30 (* Check for RSS first - RSS feeds may include atom namespace for self-links *) 54 31 if try let _ = Str.search_forward (Str.regexp "<rss") prefix 0 in true 55 32 with Not_found -> false then 56 - Source.Rss 33 + Some Source.Rss 57 34 else if try let _ = Str.search_forward (Str.regexp "<feed") prefix 0 in true 58 35 with Not_found -> false then 59 - Source.Atom 36 + Some Source.Atom 60 37 else if try let _ = Str.search_forward (Str.regexp "xmlns.*atom") prefix 0 in true 61 38 with Not_found -> false then 62 - Source.Atom 39 + Some Source.Atom 63 40 else 64 - Source.Rss 41 + Some Source.Rss 65 42 else 43 + None 44 + else 45 + None 46 + in 47 + (* First check content - this is more reliable than content-type headers, 48 + especially for feeds served through aggregators like FeedBurner which 49 + may serve RSS feeds with application/atom+xml content-type *) 50 + match detect_from_content content with 51 + | Some ft -> ft 52 + | None -> 53 + (* Fall back to content-type header *) 54 + match content_type with 55 + | Some ct -> 56 + let ct = String.lowercase_ascii ct in 57 + if String.length ct >= 16 && String.sub ct 0 16 = "application/json" then 58 + Source.Json 59 + else if String.length ct >= 21 && String.sub ct 0 21 = "application/feed+json" then 60 + Source.Json 61 + else if String.length ct >= 20 && String.sub ct 0 20 = "application/atom+xml" then 66 62 Source.Atom 67 - else 68 - Source.Atom 63 + else if String.length ct >= 19 && String.sub ct 0 19 = "application/rss+xml" then 64 + Source.Rss 65 + else if String.length ct >= 15 && String.sub ct 0 15 = "application/xml" then 66 + Source.Rss 67 + else if String.length ct >= 8 && String.sub ct 0 8 = "text/xml" then 68 + Source.Rss 69 + else 70 + Source.Atom (* Default fallback *) 71 + | None -> Source.Atom (* Default fallback *) 69 72 70 73 (* Fetch from local file *) 71 74 let fetch_file ~fs path = ··· 138 141 else 139 142 url 140 143 144 + (* Quirks for specific feeds that have non-standard formats *) 145 + module Quirks = struct 146 + let weekday_name = function 147 + | 0 -> "Sun" | 1 -> "Mon" | 2 -> "Tue" | 3 -> "Wed" 148 + | 4 -> "Thu" | 5 -> "Fri" | _ -> "Sat" 149 + 150 + let month_name = function 151 + | 1 -> "Jan" | 2 -> "Feb" | 3 -> "Mar" | 4 -> "Apr" 152 + | 5 -> "May" | 6 -> "Jun" | 7 -> "Jul" | 8 -> "Aug" 153 + | 9 -> "Sep" | 10 -> "Oct" | 11 -> "Nov" | _ -> "Dec" 154 + 155 + (* Convert ISO 8601 date to RFC 822 format. 156 + digitalflapjack.com uses ISO 8601 dates in RSS pubDate/lastBuildDate 157 + but RSS spec requires RFC 822 format. *) 158 + let iso8601_to_rfc822 iso = 159 + try 160 + (* Parse ISO 8601: 2025-10-22T12:24:00-00:00 or 2025-10-22T12:24:00Z *) 161 + let year = int_of_string (String.sub iso 0 4) in 162 + let month = int_of_string (String.sub iso 5 2) in 163 + let day = int_of_string (String.sub iso 8 2) in 164 + let hour = int_of_string (String.sub iso 11 2) in 165 + let min = int_of_string (String.sub iso 14 2) in 166 + let sec = int_of_string (String.sub iso 17 2) in 167 + (* Calculate day of week using Zeller's formula *) 168 + let m = if month < 3 then month + 12 else month in 169 + let y = if month < 3 then year - 1 else year in 170 + let dow = (day + (13 * (m + 1)) / 5 + y + y/4 - y/100 + y/400) mod 7 in 171 + let wday = weekday_name ((dow + 6) mod 7) in 172 + Printf.sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT" 173 + wday day (month_name month) year hour min sec 174 + with _ -> iso (* Return unchanged if parsing fails *) 175 + 176 + (* digitalflapjack.com uses ISO 8601 dates in RSS which requires RFC 822 *) 177 + let fix_digitalflapjack_dates content = 178 + (* Match ISO 8601 dates in pubDate and lastBuildDate tags *) 179 + let re = Str.regexp "<\\(pubDate\\|lastBuildDate\\)>\\([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9][0-9]:[0-9][0-9]:[0-9][0-9]\\)[-+Z][^<]*</\\(pubDate\\|lastBuildDate\\)>" in 180 + Str.global_substitute re (fun s -> 181 + let tag = Str.matched_group 1 s in 182 + let iso_date = Str.matched_group 2 s in 183 + let rfc822 = iso8601_to_rfc822 iso_date in 184 + Printf.sprintf "<%s>%s</%s>" tag rfc822 tag 185 + ) content 186 + 187 + (* Fix empty subtitle tags in Atom feeds. 188 + The Syndic library fails with Not_found on empty <subtitle></subtitle> tags. 189 + This affects feeds like onkargulati.com and other Jekyll-generated feeds. *) 190 + let fix_empty_subtitle content = 191 + let re = Str.regexp "<subtitle[^>]*></subtitle>" in 192 + Str.global_replace re "" content 193 + 194 + (* Fix Atom entries missing <author> elements. 195 + Per RFC 4287, entries MUST have an author unless the feed has one. 196 + Syndic enforces this strictly, so we add a default author to entries 197 + that are missing one. This is a generic fix for many Atom feeds. *) 198 + let fix_missing_entry_authors content = 199 + (* Check if this looks like an Atom feed *) 200 + let is_atom = 201 + try let _ = Str.search_forward (Str.regexp "<feed[^>]*xmlns=['\"]http://www.w3.org/2005/Atom['\"]") content 0 in true 202 + with Not_found -> 203 + try let _ = Str.search_forward (Str.regexp "<feed") content 0 in true 204 + with Not_found -> false 205 + in 206 + if not is_atom then content 207 + else 208 + (* Check if feed has a top-level author *) 209 + let has_feed_author = 210 + try 211 + let feed_start = Str.search_forward (Str.regexp "<feed") content 0 in 212 + let first_entry = try Str.search_forward (Str.regexp "<entry") content feed_start with Not_found -> String.length content in 213 + let feed_section = String.sub content feed_start (first_entry - feed_start) in 214 + try let _ = Str.search_forward (Str.regexp "<author>") feed_section 0 in true 215 + with Not_found -> false 216 + with Not_found -> false 217 + in 218 + if has_feed_author then content 219 + else 220 + (* Add a default author to entries that don't have one *) 221 + let default_author = "<author><name>Unknown</name></author>" in 222 + (* Match entry blocks and check if they have an author *) 223 + let buf = Buffer.create (String.length content + 1000) in 224 + let pos = ref 0 in 225 + let entry_re = Str.regexp "<entry>" in 226 + let author_re = Str.regexp "<author>" in 227 + let end_entry_re = Str.regexp "</entry>" in 228 + while 229 + try 230 + let entry_start = Str.search_forward entry_re content !pos in 231 + let entry_end = Str.search_forward end_entry_re content entry_start in 232 + let entry_block = String.sub content entry_start (entry_end - entry_start) in 233 + (* Check if this entry has an author *) 234 + let has_author = 235 + try let _ = Str.search_forward author_re entry_block 0 in true 236 + with Not_found -> false 237 + in 238 + (* Add content before entry *) 239 + Buffer.add_substring buf content !pos (entry_start - !pos); 240 + if has_author then begin 241 + (* Entry has author, copy as-is *) 242 + Buffer.add_substring buf content entry_start (entry_end + 8 - entry_start); 243 + end else begin 244 + (* Entry lacks author, add default after <entry> tag *) 245 + Buffer.add_string buf "<entry>\n "; 246 + Buffer.add_string buf default_author; 247 + (* Add rest of entry content (skip the <entry> tag itself) *) 248 + Buffer.add_substring buf content (entry_start + 7) (entry_end + 8 - entry_start - 7); 249 + end; 250 + pos := entry_end + 8; 251 + true 252 + with Not_found -> false 253 + do () done; 254 + (* Add remaining content *) 255 + Buffer.add_substring buf content !pos (String.length content - !pos); 256 + Buffer.contents buf 257 + 258 + let apply_for_url url content = 259 + (* Apply domain-specific fixes *) 260 + let content = 261 + if String.length url >= 20 && 262 + try let _ = Str.search_forward (Str.regexp "digitalflapjack\\.com") url 0 in true 263 + with Not_found -> false 264 + then fix_digitalflapjack_dates content 265 + else content 266 + in 267 + (* Apply generic Atom fixes - empty subtitle tags cause Syndic to fail *) 268 + let content = fix_empty_subtitle content in 269 + (* Fix Atom entries missing author elements *) 270 + fix_missing_entry_authors content 271 + end 272 + 141 273 (* Main fetch function *) 142 274 let fetch ~sw ~clock ~net ~fs url ?etag ?if_modified_since () = 143 275 if is_local_path url then ··· 258 390 ?published ?updated ?author ~tags ~created_at () 259 391 260 392 (* Parse feed content and return posts *) 261 - let parse_feed ~source_id ~feed_type ~created_at content = 393 + let parse_feed ~source_id ~feed_type ~created_at ~url content = 394 + let content = Quirks.apply_for_url url content in 262 395 try 263 396 match feed_type with 264 397 | Source.Atom -> ··· 294 427 description : string option; 295 428 } 296 429 297 - let get_feed_metadata ~feed_type content = 430 + let get_feed_metadata ~feed_type ~url content = 431 + let content = Quirks.apply_for_url url content in 298 432 try 299 433 match feed_type with 300 434 | Source.Atom ->

+2 -1

lib/core/tuatara_fetch.mli

··· 36 36 source_id:string -> 37 37 feed_type:Source.feed_type -> 38 38 created_at:Ptime.t -> 39 + url:string -> 39 40 string -> Post.t list 40 41 41 42 (** Feed metadata extracted from feed *) ··· 46 47 } 47 48 48 49 (** Extract metadata from feed content *) 49 - val get_feed_metadata : feed_type:Source.feed_type -> string -> feed_metadata 50 + val get_feed_metadata : feed_type:Source.feed_type -> url:string -> string -> feed_metadata

Configure Feed

Configure Feed