a digital person for bluesky

Add facet extraction, embed handling, and thread visualization

Add facet extraction and embed handling so that void can properly read links, link previews, and quote posts, and can see images. Also add consecutive-chain processing, which finds the last post of a multi-part reply so that it can be passed along.

Port features from umbra to void:

extract_links_from_facets: Extract link URLs with text from facets

extract_images_from_embed: Extract images from all embed types

extract_images_from_thread: Collect images from thread chronologically

extract_external_link_from_embed: Extract link card data

extract_quote_post_from_embed: Extract quoted post with status handling

extract_embed_data: Main entry point for embed extraction

Enhanced thread processing:

flatten_thread_structure now properly accesses AT Protocol properties

Each flattened post now includes links extracted from facets, structured embed data, and its parent_uri

Thread visualization:

compute_tree_prefixes: Generate tree-style prefixes

build_tree_view: Build text visualization of thread structure

thread_to_yaml_string: Add include_tree_view parameter

Consecutive chain processing:

find_last_consecutive_post_in_chain: Traverse down to find last post

find_consecutive_parent_posts_by_author: Traverse up parent chain

authored by 3fz.org and committed by tangled.org f1b8cdcc b1954184

Changed files
+819 -18
+819 -18
bsky_utils.py
··· 115 115 return obj 116 116 117 117 118 + def extract_links_from_facets(record_text: str, facets: list) -> list: 119 + """ 120 + Extract link URLs from facets with their associated text. 121 + 122 + Args: 123 + record_text: The post text (needed to extract link text using byte offsets) 124 + facets: List of facet objects from post record 125 + 126 + Returns: 127 + List of dicts with 'url' and 'text' keys 128 + """ 129 + links = [] 130 + text_bytes = record_text.encode('utf-8') 131 + 132 + for facet in facets: 133 + for feature in facet.features: 134 + if hasattr(feature, 'uri'): # Link facet 135 + byte_start = facet.index.byte_start 136 + byte_end = facet.index.byte_end 137 + try: 138 + link_text = text_bytes[byte_start:byte_end].decode('utf-8') 139 + except (UnicodeDecodeError, IndexError): 140 + link_text = feature.uri # Fallback to URL itself 141 + links.append({ 142 + 'url': feature.uri, 143 + 'text': link_text 144 + }) 145 + return links 146 + 147 + 148 + def extract_images_from_embed(embed, include_thumbnails: bool = True) -> list[dict]: 149 + """Extract image URLs and alt text from a post embed (View type). 150 + 151 + This function handles the View types returned by get_post_thread(), 152 + which contain CDN URLs for images (unlike raw record embeds which 153 + only have BlobRefs). 154 + 155 + Also extracts thumbnails from external links and videos when include_thumbnails=True. 
156 + 157 + Args: 158 + embed: The embed object from post.embed (View type) 159 + include_thumbnails: Whether to include thumbnails from links/videos (default True) 160 + 161 + Returns: 162 + List of dicts with 'fullsize', 'thumb', 'alt', and optional 'source' keys 163 + """ 164 + images = [] 165 + if not embed: 166 + return images 167 + 168 + embed_type = getattr(embed, 'py_type', '') 169 + 170 + # Direct image embed (app.bsky.embed.images#view) 171 + if 'images' in embed_type and 'record' not in embed_type: 172 + for img in embed.images: 173 + images.append({ 174 + 'fullsize': getattr(img, 'fullsize', None), 175 + 'thumb': getattr(img, 'thumb', None), 176 + 'alt': getattr(img, 'alt', '') or '' 177 + }) 178 + 179 + # External link with thumbnail (app.bsky.embed.external#view) 180 + elif 'external' in embed_type and 'record' not in embed_type and include_thumbnails: 181 + if hasattr(embed, 'external') and embed.external: 182 + thumb = getattr(embed.external, 'thumb', None) 183 + if thumb: 184 + title = getattr(embed.external, 'title', '') or '' 185 + images.append({ 186 + 'fullsize': thumb, # External links only have thumb, use as fullsize too 187 + 'thumb': thumb, 188 + 'alt': f"Link preview: {title}" if title else 'Link preview image', 189 + 'source': 'external_link' 190 + }) 191 + 192 + # Video with thumbnail (app.bsky.embed.video#view) 193 + elif 'video' in embed_type and 'record' not in embed_type and include_thumbnails: 194 + thumb = getattr(embed, 'thumbnail', None) 195 + if thumb: 196 + alt = getattr(embed, 'alt', '') or 'Video thumbnail' 197 + images.append({ 198 + 'fullsize': thumb, 199 + 'thumb': thumb, 200 + 'alt': alt, 201 + 'source': 'video' 202 + }) 203 + 204 + # Quote post with media (app.bsky.embed.recordWithMedia#view) 205 + elif 'recordWithMedia' in embed_type and hasattr(embed, 'media'): 206 + media_type = getattr(embed.media, 'py_type', '') 207 + # Images in media 208 + if 'images' in media_type and hasattr(embed.media, 'images'): 209 + for img 
in embed.media.images: 210 + images.append({ 211 + 'fullsize': getattr(img, 'fullsize', None), 212 + 'thumb': getattr(img, 'thumb', None), 213 + 'alt': getattr(img, 'alt', '') or '' 214 + }) 215 + # External link thumbnail in media 216 + elif 'external' in media_type and include_thumbnails: 217 + if hasattr(embed.media, 'external') and embed.media.external: 218 + thumb = getattr(embed.media.external, 'thumb', None) 219 + if thumb: 220 + title = getattr(embed.media.external, 'title', '') or '' 221 + images.append({ 222 + 'fullsize': thumb, 223 + 'thumb': thumb, 224 + 'alt': f"Link preview: {title}" if title else 'Link preview image', 225 + 'source': 'external_link' 226 + }) 227 + # Video thumbnail in media 228 + elif 'video' in media_type and include_thumbnails: 229 + thumb = getattr(embed.media, 'thumbnail', None) 230 + if thumb: 231 + alt = getattr(embed.media, 'alt', '') or 'Video thumbnail' 232 + images.append({ 233 + 'fullsize': thumb, 234 + 'thumb': thumb, 235 + 'alt': alt, 236 + 'source': 'video' 237 + }) 238 + 239 + # Quote post - check for images in nested embeds (app.bsky.embed.record#view) 240 + elif 'record' in embed_type and 'recordWithMedia' not in embed_type: 241 + if hasattr(embed, 'record') and embed.record: 242 + record = embed.record 243 + if hasattr(record, 'embeds') and record.embeds: 244 + for nested in record.embeds: 245 + nested_type = getattr(nested, 'py_type', '') 246 + # Nested images 247 + if 'images' in nested_type and hasattr(nested, 'images'): 248 + for img in nested.images: 249 + images.append({ 250 + 'fullsize': getattr(img, 'fullsize', None), 251 + 'thumb': getattr(img, 'thumb', None), 252 + 'alt': getattr(img, 'alt', '') or '', 253 + 'source': 'quoted_post' 254 + }) 255 + # Nested external link thumbnail 256 + elif 'external' in nested_type and include_thumbnails: 257 + if hasattr(nested, 'external') and nested.external: 258 + thumb = getattr(nested.external, 'thumb', None) 259 + if thumb: 260 + title = getattr(nested.external, 
'title', '') or '' 261 + images.append({ 262 + 'fullsize': thumb, 263 + 'thumb': thumb, 264 + 'alt': f"Link preview: {title}" if title else 'Link preview image', 265 + 'source': 'quoted_post_link' 266 + }) 267 + # Nested video thumbnail 268 + elif 'video' in nested_type and include_thumbnails: 269 + thumb = getattr(nested, 'thumbnail', None) 270 + if thumb: 271 + alt = getattr(nested, 'alt', '') or 'Video thumbnail' 272 + images.append({ 273 + 'fullsize': thumb, 274 + 'thumb': thumb, 275 + 'alt': alt, 276 + 'source': 'quoted_post_video' 277 + }) 278 + 279 + return images 280 + 281 + 282 + def extract_images_from_thread(thread_data, max_images: int = 8) -> list[dict]: 283 + """Extract all images from a thread, up to max_images. 284 + 285 + Traverses the thread structure and extracts image URLs from post embeds. 286 + Images are collected in chronological order (parents before children). 287 + 288 + Args: 289 + thread_data: The thread data from get_post_thread 290 + max_images: Maximum number of images to extract (default 8) 291 + 292 + Returns: 293 + List of image dicts with 'fullsize', 'thumb', 'alt', 'author_handle' keys 294 + """ 295 + images = [] 296 + 297 + def traverse_thread(node): 298 + if not node or len(images) >= max_images: 299 + return 300 + 301 + # Traverse parent first (chronological order) 302 + if hasattr(node, 'parent') and node.parent: 303 + traverse_thread(node.parent) 304 + 305 + # Extract images from this post's embed (View type, not record.embed) 306 + if hasattr(node, 'post') and node.post: 307 + post = node.post 308 + if hasattr(post, 'embed') and post.embed: 309 + post_images = extract_images_from_embed(post.embed) 310 + author_handle = getattr(post.author, 'handle', 'unknown') if hasattr(post, 'author') else 'unknown' 311 + for img in post_images: 312 + if len(images) >= max_images: 313 + break 314 + img['author_handle'] = author_handle 315 + images.append(img) 316 + 317 + # Traverse replies 318 + if hasattr(node, 'replies') and 
node.replies: 319 + for reply in node.replies: 320 + if len(images) >= max_images: 321 + break 322 + traverse_thread(reply) 323 + 324 + if hasattr(thread_data, 'thread'): 325 + traverse_thread(thread_data.thread) 326 + 327 + return images 328 + 329 + 330 + def extract_external_link_from_embed(embed) -> dict | None: 331 + """Extract external link card data from a post embed (View type). 332 + 333 + External links are shown as "link cards" with URL, title, description, 334 + and optional thumbnail. 335 + 336 + Args: 337 + embed: The embed object from post.embed (View type) 338 + 339 + Returns: 340 + Dict with 'url', 'title', 'description', 'thumbnail' keys, or None 341 + """ 342 + if not embed: 343 + return None 344 + 345 + embed_type = getattr(embed, 'py_type', '') 346 + 347 + # Direct external link embed (app.bsky.embed.external#view) 348 + if 'external' in embed_type and hasattr(embed, 'external'): 349 + external = embed.external 350 + return { 351 + 'url': getattr(external, 'uri', ''), 352 + 'title': getattr(external, 'title', ''), 353 + 'description': getattr(external, 'description', ''), 354 + 'thumbnail': getattr(external, 'thumb', None) 355 + } 356 + 357 + # RecordWithMedia with external link (app.bsky.embed.recordWithMedia#view) 358 + if 'recordWithMedia' in embed_type and hasattr(embed, 'media'): 359 + media_type = getattr(embed.media, 'py_type', '') 360 + if 'external' in media_type and hasattr(embed.media, 'external'): 361 + external = embed.media.external 362 + return { 363 + 'url': getattr(external, 'uri', ''), 364 + 'title': getattr(external, 'title', ''), 365 + 'description': getattr(external, 'description', ''), 366 + 'thumbnail': getattr(external, 'thumb', None) 367 + } 368 + 369 + return None 370 + 371 + 372 + def extract_quote_post_from_embed(embed) -> dict | None: 373 + """Extract quoted post data from a record embed (View type). 
374 + 375 + Quote posts embed another post, which can include the quoted text, 376 + author, and any media attached to the quoted post. 377 + 378 + Args: 379 + embed: The embed object from post.embed (View type) 380 + 381 + Returns: 382 + Dict with quote post data, or None if not a quote or unavailable 383 + """ 384 + if not embed: 385 + return None 386 + 387 + embed_type = getattr(embed, 'py_type', '') 388 + 389 + # Get the record object (works for both record and recordWithMedia) 390 + record = None 391 + if 'recordWithMedia' in embed_type and hasattr(embed, 'record'): 392 + # recordWithMedia has record.record for the actual quote 393 + record = getattr(embed.record, 'record', None) 394 + elif 'record' in embed_type and hasattr(embed, 'record'): 395 + record = embed.record 396 + 397 + if not record: 398 + return None 399 + 400 + record_type = getattr(record, 'py_type', '') 401 + 402 + # Handle different quote post states 403 + if 'viewNotFound' in record_type: 404 + return { 405 + 'status': 'not_found', 406 + 'uri': getattr(record, 'uri', ''), 407 + 'message': 'Quoted post was deleted or not found' 408 + } 409 + 410 + if 'viewBlocked' in record_type: 411 + return { 412 + 'status': 'blocked', 413 + 'uri': getattr(record, 'uri', ''), 414 + 'message': 'Quoted post is from a blocked account' 415 + } 416 + 417 + if 'viewDetached' in record_type: 418 + return { 419 + 'status': 'detached', 420 + 'uri': getattr(record, 'uri', ''), 421 + 'message': 'Quoted post was detached' 422 + } 423 + 424 + # Normal quote post (viewRecord) 425 + if 'viewRecord' in record_type or hasattr(record, 'author'): 426 + result = { 427 + 'status': 'available', 428 + 'uri': getattr(record, 'uri', ''), 429 + } 430 + 431 + # Extract author info 432 + if hasattr(record, 'author') and record.author: 433 + author = record.author 434 + result['author'] = { 435 + 'handle': getattr(author, 'handle', 'unknown'), 436 + 'display_name': getattr(author, 'display_name', '') or getattr(author, 'handle', 
'unknown') 437 + } 438 + 439 + # Extract the quoted post text from value 440 + # The 'value' field contains the actual post record 441 + if hasattr(record, 'value') and record.value: 442 + value = record.value 443 + # value can be a dict or an object 444 + if isinstance(value, dict): 445 + result['text'] = value.get('text', '') 446 + elif hasattr(value, 'text'): 447 + result['text'] = getattr(value, 'text', '') 448 + 449 + # Extract engagement metrics if present 450 + metrics = {} 451 + if hasattr(record, 'like_count') and record.like_count is not None: 452 + metrics['likes'] = record.like_count 453 + if hasattr(record, 'repost_count') and record.repost_count is not None: 454 + metrics['reposts'] = record.repost_count 455 + if hasattr(record, 'reply_count') and record.reply_count is not None: 456 + metrics['replies'] = record.reply_count 457 + if hasattr(record, 'quote_count') and record.quote_count is not None: 458 + metrics['quotes'] = record.quote_count 459 + if metrics: 460 + result['metrics'] = metrics 461 + 462 + # Add thread context hints (for hybrid thread navigation) 463 + thread_context = {} 464 + 465 + # Reply count indicates replies exist below this post 466 + if metrics.get('replies'): 467 + thread_context['reply_count'] = metrics['replies'] 468 + 469 + # Check if quoted post is itself a reply (has parents above) 470 + if hasattr(record, 'value') and record.value: 471 + value = record.value 472 + reply_ref = value.get('reply') if isinstance(value, dict) else getattr(value, 'reply', None) 473 + if reply_ref: 474 + thread_context['has_parents'] = True 475 + 476 + if thread_context: 477 + result['thread_context'] = thread_context 478 + 479 + # Check for nested embeds in the quoted post 480 + if hasattr(record, 'embeds') and record.embeds: 481 + nested_embeds = [] 482 + for nested in record.embeds: 483 + nested_type = getattr(nested, 'py_type', '') 484 + if 'images' in nested_type: 485 + nested_embeds.append({'type': 'images', 'count': len(getattr(nested, 
'images', []))}) 486 + elif 'video' in nested_type: 487 + nested_embeds.append({'type': 'video'}) 488 + elif 'external' in nested_type: 489 + ext = getattr(nested, 'external', None) 490 + if ext: 491 + nested_embeds.append({ 492 + 'type': 'external_link', 493 + 'url': getattr(ext, 'uri', ''), 494 + 'title': getattr(ext, 'title', '') 495 + }) 496 + if nested_embeds: 497 + result['embeds'] = nested_embeds 498 + 499 + return result 500 + 501 + return None 502 + 503 + 504 + def extract_embed_data(embed) -> dict | None: 505 + """Extract structured data from any embed type. 506 + 507 + This is the main entry point for embed extraction. It detects the embed 508 + type and delegates to the appropriate extraction function. 509 + 510 + Args: 511 + embed: The embed object from post.embed (View type) 512 + 513 + Returns: 514 + Dict with embed type and extracted data, or None if no embed 515 + """ 516 + if not embed: 517 + return None 518 + 519 + embed_type = getattr(embed, 'py_type', '') 520 + 521 + # Images 522 + if 'images' in embed_type and 'record' not in embed_type: 523 + images = extract_images_from_embed(embed) 524 + if images: 525 + return { 526 + 'type': 'images', 527 + 'images': images 528 + } 529 + 530 + # External link 531 + if 'external' in embed_type and 'record' not in embed_type: 532 + link = extract_external_link_from_embed(embed) 533 + if link: 534 + return { 535 + 'type': 'external_link', 536 + 'link': link 537 + } 538 + 539 + # Quote post (record) 540 + if embed_type == 'app.bsky.embed.record#view': 541 + quote = extract_quote_post_from_embed(embed) 542 + if quote: 543 + return { 544 + 'type': 'quote_post', 545 + 'quote': quote 546 + } 547 + 548 + # Quote post with media (recordWithMedia) 549 + if 'recordWithMedia' in embed_type: 550 + result = {'type': 'quote_with_media'} 551 + 552 + # Extract the quote 553 + quote = extract_quote_post_from_embed(embed) 554 + if quote: 555 + result['quote'] = quote 556 + 557 + # Extract the media 558 + if hasattr(embed, 
'media'): 559 + media_type = getattr(embed.media, 'py_type', '') 560 + if 'images' in media_type: 561 + images = extract_images_from_embed(embed) 562 + if images: 563 + result['media'] = {'type': 'images', 'images': images} 564 + elif 'external' in media_type: 565 + link = extract_external_link_from_embed(embed) 566 + if link: 567 + result['media'] = {'type': 'external_link', 'link': link} 568 + elif 'video' in media_type: 569 + # Basic video info 570 + result['media'] = { 571 + 'type': 'video', 572 + 'thumbnail': getattr(embed.media, 'thumbnail', None), 573 + 'alt': getattr(embed.media, 'alt', None) 574 + } 575 + 576 + return result 577 + 578 + # Video (basic handling) 579 + if 'video' in embed_type: 580 + return { 581 + 'type': 'video', 582 + 'thumbnail': getattr(embed, 'thumbnail', None), 583 + 'alt': getattr(embed, 'alt', None) 584 + } 585 + 586 + return None 587 + 588 + 118 589 def flatten_thread_structure(thread_data): 119 590 """ 120 591 Flatten a nested thread structure into a list while preserving all data. 
121 - 592 + 122 593 Args: 123 594 thread_data: The thread data from get_post_thread 124 - 595 + 125 596 Returns: 126 597 Dict with 'posts' key containing a list of posts in chronological order 127 598 """ 128 599 posts = [] 129 - 600 + 130 601 def traverse_thread(node): 131 602 """Recursively traverse the thread structure to collect posts.""" 132 603 if not node: 133 604 return 134 - 605 + 135 606 # If this node has a parent, traverse it first (to maintain chronological order) 136 607 if hasattr(node, 'parent') and node.parent: 137 608 traverse_thread(node.parent) 138 - 609 + 139 610 # Then add this node's post 140 611 if hasattr(node, 'post') and node.post: 141 - # Convert to dict if needed to ensure we can process it 142 - if hasattr(node.post, '__dict__'): 143 - post_dict = node.post.__dict__.copy() 144 - elif isinstance(node.post, dict): 145 - post_dict = node.post.copy() 146 - else: 147 - post_dict = {} 148 - 612 + # Extract post data by accessing properties directly (not __dict__) 613 + # AT Protocol objects store data in properties, not __dict__ 614 + post = node.post 615 + 616 + # Build post dict with proper property access 617 + post_dict = {} 618 + 619 + # Extract basic fields 620 + if hasattr(post, 'uri'): 621 + post_dict['uri'] = post.uri 622 + if hasattr(post, 'cid'): 623 + post_dict['cid'] = post.cid 624 + 625 + # Extract author info 626 + if hasattr(post, 'author') and post.author: 627 + author = post.author 628 + post_dict['author'] = { 629 + 'handle': getattr(author, 'handle', 'unknown'), 630 + 'display_name': getattr(author, 'display_name', 'unknown'), 631 + 'did': getattr(author, 'did', 'unknown') 632 + } 633 + 634 + # Extract record info (text, created_at, etc.) 
635 + if hasattr(post, 'record') and post.record: 636 + record = post.record 637 + record_dict = { 638 + 'text': getattr(record, 'text', ''), 639 + 'createdAt': getattr(record, 'created_at', 'unknown') 640 + } 641 + 642 + # Extract links from facets if present 643 + if hasattr(record, 'facets') and record.facets: 644 + links = extract_links_from_facets( 645 + getattr(record, 'text', ''), 646 + record.facets 647 + ) 648 + if links: 649 + record_dict['links'] = links 650 + 651 + post_dict['record'] = record_dict 652 + 653 + # Extract embed data from post.embed (View type with CDN URLs) 654 + # This is different from record.embed which only has raw BlobRefs 655 + if hasattr(post, 'embed') and post.embed: 656 + embed_data = extract_embed_data(post.embed) 657 + if embed_data: 658 + post_dict['embed'] = embed_data 659 + 660 + # Extract parent_uri for tree visualization 661 + parent_uri = None 662 + if hasattr(post, 'record') and post.record: 663 + record_obj = post.record 664 + if hasattr(record_obj, 'reply') and record_obj.reply: 665 + reply_ref = record_obj.reply 666 + if hasattr(reply_ref, 'parent') and reply_ref.parent: 667 + if hasattr(reply_ref.parent, 'uri'): 668 + parent_uri = reply_ref.parent.uri 669 + post_dict['parent_uri'] = parent_uri 670 + 149 671 posts.append(post_dict) 150 - 672 + 673 + # Then traverse any replies (going DOWN the thread) 674 + if hasattr(node, 'replies') and node.replies: 675 + for reply in node.replies: 676 + traverse_thread(reply) 677 + 151 678 # Handle the thread structure 152 679 if hasattr(thread_data, 'thread'): 153 680 # Start from the main thread node 154 681 traverse_thread(thread_data.thread) 155 682 elif hasattr(thread_data, '__dict__') and 'thread' in thread_data.__dict__: 156 683 traverse_thread(thread_data.__dict__['thread']) 157 - 684 + 158 685 # Return a simple structure with posts list 159 686 return {'posts': posts} 160 687 ··· 173 700 return len(flattened.get('posts', [])) 174 701 175 702 176 - def 
# NOTE: this section of bsky_utils.py was reconstructed from a diff view.

# Tree-drawing segments. All four are three columns wide so that nested
# levels line up under their connectors.
_TREE_TEE = "├─ "
_TREE_ELBOW = "└─ "
_TREE_PIPE = "│  "
_TREE_BLANK = "   "


def compute_tree_prefixes(posts: List[Dict]) -> Dict[str, str]:
    """
    Compute tree-style prefixes based on parent relationships.

    A post is a root when it has no 'parent_uri' or its parent is not among
    `posts`. A single root gets an empty prefix and its children start at the
    first level; with several roots, the roots themselves get connectors.
    The walk uses an explicit stack, so thread depth is not limited by the
    interpreter's recursion limit.

    Args:
        posts: List of post dicts, each with 'uri' and 'parent_uri' keys

    Returns:
        Dict mapping uri -> prefix string (e.g., "├─ ", "│  └─ "). Posts
        without a 'uri', or not reachable from any root, are absent.
    """
    if not posts:
        return {}

    known_uris = {p.get('uri') for p in posts if p.get('uri')}
    children_map: Dict[str, List[str]] = {}  # parent_uri -> [child_uris]
    root_uris: List[str] = []

    for post in posts:
        uri = post.get('uri')
        if not uri:
            continue
        parent_uri = post.get('parent_uri')
        if parent_uri and parent_uri in known_uris:
            children_map.setdefault(parent_uri, []).append(uri)
        else:
            root_uris.append(uri)

    prefixes: Dict[str, str] = {}
    visited: set = set()
    stack: list = []  # (uri, [is_last flag for each ancestor level, then self])

    def push_siblings(uris: List[str], ancestors_last: List[bool]) -> None:
        # Pushed in reverse so the first sibling is popped (visited) first.
        last_index = len(uris) - 1
        for i in range(last_index, -1, -1):
            stack.append((uris[i], ancestors_last + [i == last_index]))

    if len(root_uris) == 1:
        only_root = root_uris[0]
        prefixes[only_root] = ""
        visited.add(only_root)
        push_siblings(children_map.get(only_root, []), [])
    else:
        push_siblings(root_uris, [])

    while stack:
        uri, ancestors_last = stack.pop()
        if uri in visited:
            continue
        visited.add(uri)

        # One filler segment per ancestor level, then this node's connector.
        parts = [_TREE_BLANK if is_last else _TREE_PIPE for is_last in ancestors_last[:-1]]
        parts.append(_TREE_ELBOW if ancestors_last[-1] else _TREE_TEE)
        prefixes[uri] = "".join(parts)

        push_siblings(children_map.get(uri, []), ancestors_last)

    return prefixes


def build_tree_view(posts: List[Dict]) -> str:
    """
    Build a tree-style text visualization of a thread.

    Args:
        posts: List of post dicts with uri, parent_uri, author, record fields

    Returns:
        Multi-line string with one "<prefix>@<handle>: <text>" line per post,
        in the order given. Newlines inside a post are shown as " | ".
        Returns "(empty thread)" when `posts` is empty.
    """
    if not posts:
        return "(empty thread)"

    prefixes = compute_tree_prefixes(posts)
    lines = []

    for post in posts:
        prefix = prefixes.get(post.get('uri', ''), '')
        # `or` guards against keys that are present but set to None.
        handle = (post.get('author') or {}).get('handle', 'unknown')
        text = ((post.get('record') or {}).get('text') or '').replace('\n', ' | ')
        lines.append(f"{prefix}@{handle}: {text}")

    return "\n".join(lines)


# NOTE(review): thread_to_yaml_string is only partially visible in the diff
# view (its metadata-stripping step is elided), so it is reproduced below as
# an excerpt and deliberately not rewritten.
#
#   def thread_to_yaml_string(thread, strip_metadata=True, include_tree_view=True):
#       """
#       Convert thread data to a YAML-formatted string for LLM parsing.
#
#       Args:
#           thread: The thread data from get_post_thread
#           strip_metadata: Whether to strip metadata fields for cleaner output
#           include_tree_view: Whether to prepend a tree visualization of the thread
#
#       Returns:
#           String representation of the thread with optional tree view and YAML data
#       """
#       # First flatten the thread structure to avoid deep nesting
#       flattened = flatten_thread_structure(thread)
#       posts = flattened.get('posts', [])
#
#       output_parts = []
#
#       # Build tree visualization if requested
#       if include_tree_view and posts:
#           tree_view = build_tree_view(posts)
#           output_parts.append("THREAD STRUCTURE:")
#           output_parts.append(tree_view)
#           output_parts.append("")
#           output_parts.append("FULL POST DATA:")
#
#       # Convert complex objects to basic types
#       basic_thread = convert_to_basic_types(flattened)
#       ... (lines elided in the diff view) ...
#       else:
#           cleaned_thread = basic_thread
#
#       yaml_output = yaml.dump(cleaned_thread, indent=2, allow_unicode=True, default_flow_style=False)
#       output_parts.append(yaml_output)
#
#       return "\n".join(output_parts)
#
# NOTE: further unchanged lines are elided here; the visible fragment is the
# end of a thread-fetching helper:
#
#       except Exception as e:
#           logger.error(f"Error fetching post thread: {e}")
#           return None


def _post_author_handle(post):
    """Return post.author.handle, or None if either attribute is missing."""
    return getattr(getattr(post, 'author', None), 'handle', None)


def find_last_consecutive_post_in_chain(thread_node, author_handle: str):
    """
    Find the last consecutive post in the direct reply chain by the same author.

    Starting from the given thread node, this follows replies downward for as
    long as some reply is by `author_handle`. When a node has several replies
    by that author, the first one is followed; other branches are ignored.

    Args:
        thread_node: The thread node to start from (usually the mention post's thread node)
        author_handle: The handle of the author to match (e.g., "user.bsky.social")

    Returns:
        Tuple of (uri, cid, text) for the last consecutive post by the author.
        This is the starting post itself when it has no reply by the author.
        Returns None if the starting node has no post, the starting post is
        not by the author, or the final post lacks a URI or CID.

    Example:
        If the thread structure is:
        - Post A by @alice (mention) -> thread_node starts here
        - Post B by @alice (consecutive)
        - Post C by @alice (consecutive)
        - Post D by @bob (different author, stop here)

        Returns (uri_C, cid_C, text_C)
    """
    start_post = getattr(thread_node, 'post', None) if thread_node else None
    if not start_post:
        return None

    # A missing handle must not match posts whose author is also missing.
    if not author_handle or _post_author_handle(start_post) != author_handle:
        return None

    # Walk down, always following the first reply by the same author.
    last_node = thread_node
    while True:
        next_node = None
        for reply in getattr(last_node, 'replies', None) or []:
            reply_post = getattr(reply, 'post', None)
            if reply_post and _post_author_handle(reply_post) == author_handle:
                next_node = reply
                break
        if next_node is None:
            break
        last_node = next_node

    # Read all three fields from the same post so they cannot drift apart.
    last_post = last_node.post
    last_uri = getattr(last_post, 'uri', None)
    last_cid = getattr(last_post, 'cid', None)
    if not (last_uri and last_cid):
        return None

    last_text = getattr(getattr(last_post, 'record', None), 'text', "")
    return (last_uri, last_cid, last_text)


def find_consecutive_parent_posts_by_author(thread_node, author_handle: str) -> List[Dict]:
    """
    Find consecutive posts by the same author in the parent chain.

    Starting from the given thread node, this walks UP the parent chain and
    collects posts until it meets a parent by a different author (or the top
    of the chain). It is the inverse of find_last_consecutive_post_in_chain,
    which walks DOWN.

    Args:
        thread_node: The thread node to start from (the notification post's thread node)
        author_handle: The handle of the author to match (e.g., "user.bsky.social")

    Returns:
        List of post dicts ('uri', 'cid', 'author', 'record') for consecutive
        posts by the author in the parent chain, in chronological order
        (oldest first). The starting node itself is not included. Returns an
        empty list if there are no parent posts by the same author.

    Example:
        If the thread structure is:
        - Post A by @alice (first part)
        - Post B by @alice (consecutive) <- start from here (notification)

        Returns [Post A dict] (not including Post B since that's the current node)
    """
    parent_posts: List[Dict] = []

    if not thread_node or not author_handle:
        return parent_posts

    parent_node = getattr(thread_node, 'parent', None)
    while parent_node:
        parent_post = getattr(parent_node, 'post', None)
        if not parent_post:
            break

        parent_author = _post_author_handle(parent_post)
        if parent_author != author_handle:
            # Parent is by different author, stop here
            break

        author = getattr(parent_post, 'author', None)
        record = getattr(parent_post, 'record', None)
        parent_posts.append({
            'uri': getattr(parent_post, 'uri', ''),
            'cid': getattr(parent_post, 'cid', ''),
            'author': {
                'handle': parent_author,
                'display_name': getattr(author, 'display_name', ''),
                'did': getattr(author, 'did', ''),
            },
            'record': {
                'text': getattr(record, 'text', ''),
                'createdAt': getattr(record, 'created_at', ''),
            },
        })

        # Move up to the next parent
        parent_node = getattr(parent_node, 'parent', None)

    # Collected newest-first; return in chronological order (oldest first)
    parent_posts.reverse()
    return parent_posts


# NOTE: the diff view ends at the unchanged signature of the next definition,
# whose body is not shown:
#
#   def reply_to_notification(client: Client, notification: Any, reply_text: str,
#                             lang: str = "en-US",
#                             correlation_id: Optional[str] = None) -> Optional[Dict[str, Any]]: