···255255 Buffer.add_substring buf content !pos (String.length content - !pos);
256256 Buffer.contents buf
(* Fix Quarto-generated RSS feeds where <description> contains the full HTML
   of the post inside a CDATA section.

   [fix_quarto_descriptions content] returns [content] unchanged unless it
   contains "<generator>quarto". Otherwise, for every
   <description><![CDATA[...]]></description> whose payload contains a
   <section>, <h1> or <h2> tag and a non-empty first <p>...</p>, the element
   is replaced by:
     - a <description> holding the plain text of that first paragraph
       (tags stripped, common entities decoded, capped at 300 bytes plus
       "...", then XML-escaped), and
     - a <content:encoded> CDATA section holding the original payload.
   Descriptions that do not qualify are copied through verbatim. *)
let fix_quarto_descriptions content =
  let found re s =
    match Str.search_forward re s 0 with
    | _ -> true
    | exception Not_found -> false
  in
  if not (found (Str.regexp "<generator>quarto") content) then content
  else begin
    (* Compile every pattern once instead of once per feed item. *)
    let desc_start_re = Str.regexp "<description><!\\[CDATA\\[" in
    let desc_end_re = Str.regexp "\\]\\]></description>" in
    let full_html_re = Str.regexp "<section\\|<h2\\|<h1" in
    let p_open_re = Str.regexp "<p[^>]*>" in
    let p_close_re = Str.regexp "</p>" in
    let tag_re = Str.regexp "<[^>]*>" in
    let max_len = 300 in
    (* Decode common HTML entities. "&amp;" is handled last so that text such
       as "&amp;lt;" becomes "&lt;" rather than being decoded twice. *)
    let decode_entities text =
      List.fold_left
        (fun acc (entity, plain) ->
          Str.global_replace (Str.regexp_string entity) plain acc)
        text
        [ ("&lt;", "<"); ("&gt;", ">"); ("&quot;", "\""); ("&#39;", "'");
          ("&apos;", "'"); ("&nbsp;", " "); ("&amp;", "&") ]
    in
    (* Cap at [max_len] bytes, backing off to the start of a UTF-8 sequence so
       a multi-byte character is never cut in half. *)
    let truncate text =
      if String.length text <= max_len then text
      else begin
        let rec boundary i =
          if i > 0 && Char.code text.[i] land 0xC0 = 0x80 then boundary (i - 1)
          else i
        in
        String.sub text 0 (boundary max_len) ^ "..."
      end
    in
    (* The summary is emitted outside CDATA, so markup characters must be
       escaped to keep the feed well-formed. *)
    let escape_xml text =
      let out = Buffer.create (String.length text + 16) in
      String.iter
        (function
          | '&' -> Buffer.add_string out "&amp;"
          | '<' -> Buffer.add_string out "&lt;"
          | '>' -> Buffer.add_string out "&gt;"
          | c -> Buffer.add_char out c)
        text;
      Buffer.contents out
    in
    (* Plain-text summary from the first <p>...</p>, or "" when there is none. *)
    let summary_of full_desc =
      match Str.search_forward p_open_re full_desc 0 with
      | exception Not_found -> ""
      | _ ->
        (* Read match_end immediately: later Str calls overwrite it. *)
        let body_start = Str.match_end () in
        (match Str.search_forward p_close_re full_desc body_start with
         | exception Not_found -> ""
         | body_end ->
           String.sub full_desc body_start (body_end - body_start)
           |> Str.global_replace tag_re ""
           |> decode_entities
           |> truncate
           |> escape_xml)
    in
    let buf = Buffer.create (String.length content) in
    (* Processes descriptions from [pos] onwards and returns the offset of the
       first byte that has not been copied into [buf]. *)
    let rec loop pos =
      match Str.search_forward desc_start_re content pos with
      | exception Not_found -> pos
      | start_pos ->
        let cdata_start = Str.match_end () in
        (match Str.search_forward desc_end_re content cdata_start with
         | exception Not_found -> pos
         | end_pos ->
           (* End of "]]></description>" (17 bytes), taken from the match
              itself rather than a hand-counted constant. *)
           let next_pos = Str.match_end () in
           let full_desc =
             String.sub content cdata_start (end_pos - cdata_start)
           in
           let summary =
             if found full_html_re full_desc then summary_of full_desc else ""
           in
           Buffer.add_substring buf content pos (start_pos - pos);
           if summary <> "" then begin
             (* Short summary as description, full content as content:encoded *)
             Buffer.add_string buf "<description>";
             Buffer.add_string buf summary;
             Buffer.add_string buf "</description>\n <content:encoded><![CDATA[";
             Buffer.add_string buf full_desc;
             Buffer.add_string buf "]]></content:encoded>"
           end else
             (* Not full HTML, or no usable paragraph: keep the element as-is *)
             Buffer.add_substring buf content start_pos (next_pos - start_pos);
           loop next_pos)
    in
    let tail = loop 0 in
    Buffer.add_substring buf content tail (String.length content - tail);
    Buffer.contents buf
  end
335335-336258 let apply_for_url url content =
337337- (* Apply domain-specific fixes *)
259259+ (* Apply domain-specific fixes needed for parsing *)
338260 let content =
339261 if String.length url >= 20 &&
340262 try let _ = Str.search_forward (Str.regexp "digitalflapjack\\.com") url 0 in true
···345267 (* Apply generic Atom fixes - empty subtitle tags cause Syndic to fail *)
346268 let content = fix_empty_subtitle content in
347269 (* Fix Atom entries missing author elements *)
348348- let content = fix_missing_entry_authors content in
349349- (* Fix Quarto RSS feeds with full content in description *)
350350- fix_quarto_descriptions content
270270+ fix_missing_entry_authors content
271271+ (* Note: Summary extraction from full HTML content is now done at output time
272272+ in tuatara_output.ml, not here. This preserves original content in the database. *)
351273end
352274353275(* Main fetch function *)
+97-2
lib/core/tuatara_output.ml
···66module Post = Tuatara_schema.Post
77module Config = Tuatara_schema.Config
(* [is_in_code_block node] is true when at least one ancestor of [node] has a
   "class" attribute whose space-separated tokens include "cell", "code",
   "sourceCode" or "code-fold".
   NOTE(review): these look like Quarto/Pandoc code-cell classes - confirm. *)
let is_in_code_block node =
  let code_classes = [ "cell"; "code"; "sourceCode"; "code-fold" ] in
  let marks_code ancestor =
    match Html5rw.get_attr ancestor "class" with
    | None -> false
    | Some class_attr ->
      let tokens = String.split_on_char ' ' class_attr in
      List.exists (fun marker -> List.mem marker tokens) code_classes
  in
  List.exists marks_code (Html5rw.ancestors node)
(* Extract a short plain-text summary from HTML content using html5rw.

   [extract_first_paragraph html] parses [html] as a fragment inside a <div>
   context and returns the text of the first <p> that is not inside a code
   block and has non-blank text. When no paragraph qualifies, it falls back
   to the first qualifying heading, searching every <h1>, then every <h2>,
   then every <h3> (e.g. for code notebooks). Text is whitespace-normalized
   and capped at 300 bytes plus "...".

   Returns [None] when nothing qualifies or when parsing/querying raises:
   summary extraction is best-effort and must never fail the output stage. *)
let extract_first_paragraph html =
  let whitespace_re = Str.regexp "[ \t\n\r]+" in
  let max_len = 300 in
  (* Collapse whitespace runs, trim, and cap the length. The cut point backs
     off to the start of a UTF-8 sequence so that a multi-byte character is
     never split, which would yield invalid UTF-8 in the generated feeds. *)
  let normalize_and_truncate text =
    let text = String.trim (Str.global_replace whitespace_re " " text) in
    if String.length text > max_len then begin
      let rec boundary i =
        if i > 0 && Char.code text.[i] land 0xC0 = 0x80 then boundary (i - 1)
        else i
      in
      Some (String.sub text 0 (boundary max_len) ^ "...")
    end
    else if String.length text > 0 then Some text
    else None
  in
  (* First node outside a code block whose text is non-blank. Blank nodes are
     skipped rather than ending the search, so an empty leading <p> does not
     suppress the summary. *)
  let rec first_summary = function
    | [] -> None
    | node :: rest ->
      if is_in_code_block node then first_summary rest
      else
        (match normalize_and_truncate (Html5rw.get_text_content node) with
         | Some _ as summary -> summary
         | None -> first_summary rest)
  in
  try
    let ctx = Html5rw.make_fragment_context ~tag_name:"div" () in
    let reader = Bytesrw.Bytes.Reader.of_string html in
    let result = Html5rw.parse ~fragment_context:ctx reader in
    match first_summary (Html5rw.query result "p") with
    | Some _ as summary -> summary
    | None ->
      (* Fallback: headings are only queried when no paragraph qualified *)
      first_summary
        (Html5rw.query result "h1"
         @ Html5rw.query result "h2"
         @ Html5rw.query result "h3")
  with _ -> None (* deliberate best-effort: any failure means "no summary" *)
(* [looks_like_html s] is true when [s], once trimmed, is non-empty and
   contains a '<' immediately followed by an ASCII letter, i.e. something
   shaped like the start of an opening tag. *)
let looks_like_html s =
  match String.trim s with
  | "" -> false
  | trimmed ->
    (match Str.search_forward (Str.regexp "<[a-zA-Z]") trimmed 0 with
     | _ -> true
     | exception Not_found -> false)
(* Clean a post summary for output.

   - [None], or a summary that is blank after trimming, gives [None].
   - A summary that looks like HTML and contains more than two "<p " / "<p>"
     openings is treated as full post content and replaced by the result of
     [extract_first_paragraph]; if that yields nothing, the trimmed original
     is kept.
   - Anything else is returned trimmed. *)
let clean_summary summary =
  (* Number of paragraph openings ("<p" followed by a space or '>') *)
  let count_paragraph_tags text =
    let p_tag = Str.regexp "<p[ >]" in
    let rec go from seen =
      match Str.search_forward p_tag text from with
      | exception Not_found -> seen
      | _ -> go (Str.match_end ()) (seen + 1)
    in
    go 0 0
  in
  match summary with
  | None -> None
  | Some raw ->
    let text = String.trim raw in
    if String.length text = 0 then None
    else if not (looks_like_html text) then Some text
    else if count_paragraph_tags text <= 2 then Some text
    else
      (* Looks like full HTML content: shorten it to its first paragraph *)
      (match extract_first_paragraph text with
       | Some _ as short -> short
       | None -> Some text)
103103+9104type output_format = Atom | Json | Html | Html_festive
1010511106let output_format_of_string s =
···104199 ~content
105200 ?url:(Post.url post)
106201 ?title:(Post.title post)
107107- ?summary:(Post.summary post)
202202+ ?summary:(clean_summary (Post.summary post))
108203 ?date_published:(Post.published post)
109204 ?date_modified:(Post.updated post)
110205 ?authors
···243338 add " </author>\n"
244339 | None -> ());
245340246246- (match Post.summary post with
341341+ (match clean_summary (Post.summary post) with
247342 | Some s -> addf " <summary>%s</summary>\n" (escape_xml s)
248343 | None -> ());
249344