# personal memory agent
# at main 208 lines 6.4 kB view raw
1# SPDX-License-Identifier: AGPL-3.0-only 2# Copyright (c) 2026 sol pbc 3 4"""Shared utilities for output extraction hooks. 5 6This module provides common functions used by extraction hooks like 7occurrence.py and anticipation.py in the talent/ directory. 8""" 9 10import json 11import logging 12import os 13from pathlib import Path 14 15# Minimum content length for meaningful event extraction 16MIN_EXTRACTION_CHARS = 50 17 18 19def should_skip_extraction(result: str, context: dict) -> str | None: 20 """Check if extraction should be skipped and return reason, or None to proceed. 21 22 Args: 23 result: The generated output markdown content. 24 context: Hook context dict with meta and span. 25 26 Returns: 27 Skip reason string if extraction should be skipped, None otherwise. 28 """ 29 meta = context.get("meta", {}) 30 31 # Skip if extraction disabled via journal config 32 if meta.get("extract") is False: 33 return "extraction disabled via journal config" 34 35 # Skip for JSON output (output IS the structured data) 36 if meta.get("output") == "json": 37 return "JSON output (already structured)" 38 39 # Skip in span mode (multiple sequential segments) 40 if context.get("span"): 41 return "span mode" 42 43 # Skip for minimal content 44 if len(result.strip()) < MIN_EXTRACTION_CHARS: 45 return f"minimal content ({len(result.strip())} chars < {MIN_EXTRACTION_CHARS})" 46 47 return None 48 49 50def log_extraction_failure(e: Exception, name: str) -> None: 51 """Log enhanced diagnostics for extraction generation failures. 52 53 Handles IncompleteJSONError specially by logging a single-line summary 54 with a head+tail sample and degenerate repetition detection. 55 56 Args: 57 e: The exception from generate(). 58 name: Generator name for log context. 
59 """ 60 from think.models import IncompleteJSONError 61 62 if not isinstance(e, IncompleteJSONError): 63 logging.error("Extraction generation failed for %s: %s", name, e) 64 return 65 66 partial = e.partial_text 67 length = len(partial) 68 69 # Build single-line head+tail sample (newlines collapsed for log grep) 70 def _collapse(s: str) -> str: 71 return s.replace("\n", "\\n").replace("\r", "") 72 73 if length <= 300: 74 sample = _collapse(partial) 75 else: 76 sample = f"{_collapse(partial[:150])} ... {_collapse(partial[-150:])}" 77 78 # Repetition detection: count unique chars in last 1000 79 tail = partial[-1000:] if length >= 1000 else partial 80 unique_count = len(set(tail)) 81 repetition_flag = "" 82 if unique_count < 20: 83 repetition_flag = ( 84 f" [POSSIBLE DEGENERATE REPETITION: " 85 f"{unique_count} unique chars in last {len(tail)}]" 86 ) 87 88 logging.error( 89 "Extraction generation failed for %s: %s " 90 "(partial_text: %d chars, %d unique in tail%s) sample: %s", 91 name, 92 e, 93 length, 94 unique_count, 95 repetition_flag, 96 sample, 97 ) 98 99 100def write_events_jsonl( 101 events: list[dict], 102 agent: str, 103 occurred: bool, 104 source_output: str, 105 capture_day: str, 106) -> list[Path]: 107 """Write events to facet-based JSONL files. 108 109 Groups events by facet and writes each to the appropriate file: 110 facets/{facet}/events/{event_day}.jsonl 111 112 Args: 113 events: List of event dictionaries from extraction. 114 agent: Source generator agent (e.g., "meetings", "schedule"). 115 occurred: True for occurrences, False for anticipations. 116 source_output: Relative path to source output file. 117 capture_day: Day the output was captured (YYYYMMDD). 118 119 Returns: 120 List of paths to written JSONL files. 
121 """ 122 from think.utils import get_journal 123 124 journal = get_journal() 125 126 # Group events by (facet, event_day) 127 grouped: dict[tuple[str, str], list[dict]] = {} 128 129 for event in events: 130 facet = event.get("facet", "") 131 if not facet: 132 continue # Skip events without facet 133 134 # Determine the event day 135 if occurred: 136 # Occurrences use capture day 137 event_day = capture_day 138 else: 139 # Anticipations use their scheduled date 140 event_date = event.get("date", "") 141 # Convert YYYY-MM-DD to YYYYMMDD 142 event_day = event_date.replace("-", "") if event_date else capture_day 143 144 if not event_day: 145 continue 146 147 key = (facet, event_day) 148 if key not in grouped: 149 grouped[key] = [] 150 151 # Enrich event with metadata 152 enriched = dict(event) 153 enriched["agent"] = agent 154 enriched["occurred"] = occurred 155 enriched["source"] = source_output 156 157 grouped[key].append(enriched) 158 159 # Write each group to its JSONL file 160 written_paths: list[Path] = [] 161 162 for (facet, event_day), facet_events in grouped.items(): 163 events_dir = Path(journal) / "facets" / facet / "events" 164 events_dir.mkdir(parents=True, exist_ok=True) 165 166 jsonl_path = events_dir / f"{event_day}.jsonl" 167 with open(jsonl_path, "a", encoding="utf-8") as f: 168 for event in facet_events: 169 f.write(json.dumps(event, ensure_ascii=False) + "\n") 170 171 written_paths.append(jsonl_path) 172 173 return written_paths 174 175 176def compute_output_source(context: dict) -> str: 177 """Compute relative source output path from hook context. 178 179 Args: 180 context: Hook context dict with day, segment, name, output_path, meta. 181 182 Returns: 183 Relative path like "20240101/agents/meetings.md". 
184 """ 185 from think.talent import get_output_name 186 from think.utils import get_journal 187 188 day = context.get("day", "") 189 output_path = context.get("output_path", "") 190 name = context.get("name", "unknown") 191 journal = get_journal() 192 193 try: 194 return os.path.relpath(output_path, journal) 195 except ValueError: 196 segment = context.get("segment") 197 output_name = get_output_name(name) 198 # Check for facet in meta (for multi-facet agents) 199 meta = context.get("meta", {}) 200 facet = meta.get("facet") if meta else None 201 filename = f"{output_name}.md" 202 if segment and facet: 203 return os.path.join(day, segment, "agents", facet, filename) 204 if segment: 205 return os.path.join(day, segment, "agents", filename) 206 if facet: 207 return os.path.join(day, "agents", facet, filename) 208 return os.path.join(day, "agents", filename)