so much evolution · anil.recoil.org/tuatara@7f29b37

+2

dune-project

··· 38 38 requests 39 39 xmlm 40 40 sortal 41 + html5rw 42 + bytesrw 41 43 (odoc :with-doc) 42 44 (alcotest (and :with-test (>= 1.7.0)))))

+3 -1

lib/core/dune

··· 24 24 digestif 25 25 requests 26 26 str 27 - xmlm)) 27 + xmlm 28 + html5rw 29 + bytesrw))

+4 -82

lib/core/tuatara_fetch.ml

··· 255 255 Buffer.add_substring buf content !pos (String.length content - !pos); 256 256 Buffer.contents buf 257 257 258 - (* Fix Quarto-generated RSS feeds where description contains full HTML content. 259 - Quarto puts the entire post in <description> instead of using <content:encoded>. 260 - We extract the first paragraph as the summary and move content appropriately. *) 261 - let fix_quarto_descriptions content = 262 - (* Check if this is a Quarto-generated feed *) 263 - let is_quarto = 264 - try let _ = Str.search_forward (Str.regexp "<generator>quarto") content 0 in true 265 - with Not_found -> false 266 - in 267 - if not is_quarto then content 268 - else 269 - (* For each item, find description CDATA and extract first paragraph *) 270 - let desc_start_re = Str.regexp "<description><!\\[CDATA\\[" in 271 - let desc_end_re = Str.regexp "\\]\\]></description>" in 272 - let buf = Buffer.create (String.length content) in 273 - let pos = ref 0 in 274 - while 275 - try 276 - let start_pos = Str.search_forward desc_start_re content !pos in 277 - let cdata_start = Str.match_end () in 278 - let end_pos = Str.search_forward desc_end_re content cdata_start in 279 - let full_desc = String.sub content cdata_start (end_pos - cdata_start) in 280 - (* Only process if description looks like full HTML (has section/h2 tags) *) 281 - let is_full_html = 282 - try let _ = Str.search_forward (Str.regexp "<section\\|<h2\\|<h1") full_desc 0 in true 283 - with Not_found -> false 284 - in 285 - if is_full_html then begin 286 - (* Extract first paragraph text, stripping HTML *) 287 - let first_para = 288 - try 289 - let _p_start = Str.search_forward (Str.regexp "<p[^>]*>") full_desc 0 in 290 - let p_content_start = Str.match_end () in 291 - let p_end = Str.search_forward (Str.regexp "</p>") full_desc p_content_start in 292 - let para = String.sub full_desc p_content_start (p_end - p_content_start) in 293 - (* Strip HTML tags from paragraph *) 294 - let stripped = Str.global_replace 
(Str.regexp "<[^>]*>") "" para in 295 - (* Decode common HTML entities *) 296 - let stripped = Str.global_replace (Str.regexp "&") "&" stripped in 297 - let stripped = Str.global_replace (Str.regexp "<") "<" stripped in 298 - let stripped = Str.global_replace (Str.regexp ">") ">" stripped in 299 - let stripped = Str.global_replace (Str.regexp """) "\"" stripped in 300 - let stripped = Str.global_replace (Str.regexp "'") "'" stripped in 301 - let stripped = Str.global_replace (Str.regexp " ") " " stripped in 302 - (* Truncate if too long *) 303 - let max_len = 300 in 304 - if String.length stripped > max_len then 305 - String.sub stripped 0 max_len ^ "..." 306 - else 307 - stripped 308 - with Not_found -> "" 309 - in 310 - if String.length first_para > 0 then begin 311 - (* Add content before this description *) 312 - Buffer.add_substring buf content !pos (start_pos - !pos); 313 - (* Add short summary as description, full content as content:encoded *) 314 - Buffer.add_string buf "<description>"; 315 - Buffer.add_string buf first_para; 316 - Buffer.add_string buf "</description>\n <content:encoded><![CDATA["; 317 - Buffer.add_string buf full_desc; 318 - Buffer.add_string buf "]]></content:encoded>"; 319 - pos := end_pos + 14 (* length of "]]></description>" *) 320 - end else begin 321 - (* No paragraph found, keep as-is *) 322 - Buffer.add_substring buf content !pos (end_pos + 14 - !pos); 323 - pos := end_pos + 14 324 - end 325 - end else begin 326 - (* Not full HTML, keep as-is *) 327 - Buffer.add_substring buf content !pos (end_pos + 14 - !pos); 328 - pos := end_pos + 14 329 - end; 330 - true 331 - with Not_found -> false 332 - do () done; 333 - Buffer.add_substring buf content !pos (String.length content - !pos); 334 - Buffer.contents buf 335 - 336 258 let apply_for_url url content = 337 - (* Apply domain-specific fixes *) 259 + (* Apply domain-specific fixes needed for parsing *) 338 260 let content = 339 261 if String.length url >= 20 && 340 262 try let _ = 
Str.search_forward (Str.regexp "digitalflapjack\\.com") url 0 in true ··· 345 267 (* Apply generic Atom fixes - empty subtitle tags cause Syndic to fail *) 346 268 let content = fix_empty_subtitle content in 347 269 (* Fix Atom entries missing author elements *) 348 - let content = fix_missing_entry_authors content in 349 - (* Fix Quarto RSS feeds with full content in description *) 350 - fix_quarto_descriptions content 270 + fix_missing_entry_authors content 271 + (* Note: Summary extraction from full HTML content is now done at output time 272 + in tuatara_output.ml, not here. This preserves original content in the database. *) 351 273 end 352 274 353 275 (* Main fetch function *)

+97 -2

lib/core/tuatara_output.ml

··· 6 6 module Post = Tuatara_schema.Post 7 7 module Config = Tuatara_schema.Config 8 8 9 + (* [is_in_code_block node] is true when any node returned by [Html5rw.ancestors node] has a class attribute containing one of the words cell, code, sourceCode or code-fold. The attribute is split on the space character only, so class names separated by tabs or newlines are not recognised. *) 10 + let is_in_code_block node = 11 + let ancestors = Html5rw.ancestors node in 12 + List.exists (fun anc -> 13 + match Html5rw.get_attr anc "class" with 14 + | Some cls -> 15 + let words = String.split_on_char ' ' cls in 16 + List.mem "cell" words || List.mem "code" words || 17 + List.mem "sourceCode" words || List.mem "code-fold" words 18 + | None -> false 19 + ) ancestors 20 + 21 + (* [extract_first_paragraph html] parses [html] with html5rw as a fragment in a div context and returns a short plain-text summary. 22 + It uses the text of the first p element that is not inside a code block; when there is no such p it falls back to heading text (e.g., for code notebooks). 23 + The result is [None] when the chosen text is empty after whitespace normalisation, or when any exception is raised (the handler at the end catches every exception, not only parse errors). *) 24 + let extract_first_paragraph html = 25 + try 26 + let ctx = Html5rw.make_fragment_context ~tag_name:"div" () in 27 + let reader = Bytesrw.Bytes.Reader.of_string html in 28 + let result = Html5rw.parse ~fragment_context:ctx reader in 29 + (* Collapse each run of space, tab, newline or carriage return to one space, trim, then cap the text at [max_len] and append three dots. [String.length] and [String.sub] count bytes, so the cut can fall inside a multi-byte UTF-8 character. NOTE(review): confirm whether byte-level truncation is acceptable for the feed output. *) 30 + let normalize_and_truncate text = 31 + let text = Str.global_replace (Str.regexp "[ \t\n\r]+") " " text in 32 + let text = String.trim text in 33 + let max_len = 300 in 34 + if String.length text > max_len then 35 + Some (String.sub text 0 max_len ^ "...") 36 + else if String.length text > 0 then 37 + Some text 38 + else 39 + None 40 + in 41 + 42 + (* First try: the first p element that is not inside a code block. If that paragraph has no text the function returns None; the heading fallback below is only reached when no such p exists. *) 43 + let paragraphs = Html5rw.query result "p" in 44 + let first_para = List.find_opt (fun p -> not (is_in_code_block p)) paragraphs in 45 + match first_para with 46 + | Some p -> 47 + let text = Html5rw.get_text_content p in 48 + normalize_and_truncate text 49 + | None -> 50 + (* Fallback: first heading not in a code block. The three query results are concatenated, so every h1 is considered before any h2, and every h2 before any h3. *) 51 + let headings = 52 + Html5rw.query result "h1" @ 53 + Html5rw.query result "h2" @ 54 + Html5rw.query result "h3" 55 + in 56 + let first_heading = List.find_opt (fun h -> not (is_in_code_block h)) headings in 57 + (match
first_heading with 58 + | Some h -> 59 + let text = Html5rw.get_text_content h in 60 + normalize_and_truncate text 61 + | None -> None) 62 + with _ -> None 63 + 64 + (* [looks_like_html s] is true when the trimmed string is non-empty and contains a < immediately followed by an ASCII letter, i.e. something shaped like an opening tag. *) 65 + let looks_like_html s = 66 + let s = String.trim s in 67 + String.length s > 0 && ( 68 + (* Str.search_forward raises Not_found when the pattern does not occur *) 69 + try 70 + let _ = Str.search_forward (Str.regexp "<[a-zA-Z]") s 0 in 71 + true 72 + with Not_found -> false 73 + ) 74 + 75 + (* [clean_summary summary] maps None and blank strings to None. Otherwise it returns the trimmed string, except that HTML with more than two paragraph openers is replaced by the result of [extract_first_paragraph] when that succeeds. *) 76 + let clean_summary = function 77 + | None -> None 78 + | Some s -> 79 + let s = String.trim s in 80 + if String.length s = 0 then None 81 + else if looks_like_html s then 82 + (* Count occurrences of <p followed by a space or >. The pattern is case-sensitive and each search resumes at the end of the previous match. *) 83 + let para_count = 84 + let re = Str.regexp "<p[ >]" in 85 + let count = ref 0 in 86 + let p = ref 0 in 87 + (try while true do 88 + let _ = Str.search_forward re s !p in 89 + incr count; 90 + p := Str.match_end () 91 + done with Not_found -> ()); 92 + !count 93 + in 94 + if para_count > 2 then 95 + (* More than two paragraphs: treat the summary as full post content and shorten it *) 96 + match extract_first_paragraph s with 97 + | Some short -> Some short 98 + | None -> Some s (* Extraction returned None: keep the trimmed input *) 99 + else 100 + Some s 101 + else 102 + Some s 103 + 9 104 type output_format = Atom | Json | Html | Html_festive 10 105 11 106 let output_format_of_string s = ··· 104 199 ~content 105 200 ?url:(Post.url post) 106 201 ?title:(Post.title post) 107 - ?summary:(Post.summary post) 202 + ?summary:(clean_summary (Post.summary post)) 108 203 ?date_published:(Post.published post) 109 204 ?date_modified:(Post.updated post) 110 205 ?authors ··· 243 338 add " </author>\n" 244 339 | None -> ()); 245 340 246 - (match Post.summary post with 341 + (match clean_summary (Post.summary post) with 247 342 | Some s -> addf " <summary>%s</summary>\n" (escape_xml s) 248 343 | None -> ()); 249 344

+2

tuatara.opam

··· 32 32 "requests" 33 33 "xmlm" 34 34 "sortal" 35 + "html5rw" 36 + "bytesrw" 35 37 "odoc" {with-doc} 36 38 "alcotest" {with-test & >= "1.7.0"} 37 39 ]

Configure Feed

Configure Feed