# personal memory agent
1# SPDX-License-Identifier: AGPL-3.0-only
2# Copyright (c) 2026 sol pbc
3
4"""Shared utilities for output extraction hooks.
5
6This module provides common functions used by extraction hooks like
7occurrence.py and anticipation.py in the talent/ directory.
8"""
9
10import json
11import logging
12import os
13from pathlib import Path
14
15# Minimum content length for meaningful event extraction
16MIN_EXTRACTION_CHARS = 50
17
18
19def should_skip_extraction(result: str, context: dict) -> str | None:
20 """Check if extraction should be skipped and return reason, or None to proceed.
21
22 Args:
23 result: The generated output markdown content.
24 context: Hook context dict with meta and span.
25
26 Returns:
27 Skip reason string if extraction should be skipped, None otherwise.
28 """
29 meta = context.get("meta", {})
30
31 # Skip if extraction disabled via journal config
32 if meta.get("extract") is False:
33 return "extraction disabled via journal config"
34
35 # Skip for JSON output (output IS the structured data)
36 if meta.get("output") == "json":
37 return "JSON output (already structured)"
38
39 # Skip in span mode (multiple sequential segments)
40 if context.get("span"):
41 return "span mode"
42
43 # Skip for minimal content
44 if len(result.strip()) < MIN_EXTRACTION_CHARS:
45 return f"minimal content ({len(result.strip())} chars < {MIN_EXTRACTION_CHARS})"
46
47 return None
48
49
def log_extraction_failure(e: Exception, name: str) -> None:
    """Log enhanced diagnostics for extraction generation failures.

    Handles IncompleteJSONError specially by logging a single-line summary
    with a head+tail sample and degenerate repetition detection.

    Args:
        e: The exception from generate().
        name: Generator name for log context.
    """
    from think.models import IncompleteJSONError

    # Anything other than a truncated-JSON failure gets a plain error line.
    if not isinstance(e, IncompleteJSONError):
        logging.error("Extraction generation failed for %s: %s", name, e)
        return

    partial = e.partial_text
    total = len(partial)

    def _flatten(text: str) -> str:
        # Collapse newlines so the sample stays on one greppable log line.
        return text.replace("\n", "\\n").replace("\r", "")

    # Short payloads are shown whole; long ones as head ... tail excerpt.
    if total <= 300:
        sample = _flatten(partial)
    else:
        sample = f"{_flatten(partial[:150])} ... {_flatten(partial[-150:])}"

    # Degenerate-repetition heuristic: very few distinct chars in the tail.
    tail = partial if total < 1000 else partial[-1000:]
    distinct = len(set(tail))
    flag = ""
    if distinct < 20:
        flag = (
            f" [POSSIBLE DEGENERATE REPETITION: "
            f"{distinct} unique chars in last {len(tail)}]"
        )

    logging.error(
        "Extraction generation failed for %s: %s "
        "(partial_text: %d chars, %d unique in tail%s) sample: %s",
        name,
        e,
        total,
        distinct,
        flag,
        sample,
    )
98
99
def write_events_jsonl(
    events: list[dict],
    agent: str,
    occurred: bool,
    source_output: str,
    capture_day: str,
) -> list[Path]:
    """Write events to facet-based JSONL files.

    Groups events by facet and writes each to the appropriate file:
    facets/{facet}/events/{event_day}.jsonl

    Args:
        events: List of event dictionaries from extraction.
        agent: Source generator agent (e.g., "meetings", "schedule").
        occurred: True for occurrences, False for anticipations.
        source_output: Relative path to source output file.
        capture_day: Day the output was captured (YYYYMMDD).

    Returns:
        List of paths to written JSONL files.
    """
    from think.utils import get_journal

    journal_root = Path(get_journal())

    # Bucket events by (facet, event_day) so each target file is opened once.
    buckets: dict[tuple[str, str], list[dict]] = {}

    for raw in events:
        facet = raw.get("facet", "")
        if not facet:
            # Events without a facet have no destination file.
            continue

        if occurred:
            # Occurrences are filed under the capture day.
            day = capture_day
        else:
            # Anticipations are filed under their scheduled date,
            # normalized from YYYY-MM-DD to YYYYMMDD; fall back to capture day.
            scheduled = raw.get("date", "")
            day = scheduled.replace("-", "") if scheduled else capture_day

        if not day:
            continue

        # Copy before enriching so the caller's dicts stay untouched.
        record = dict(raw)
        record["agent"] = agent
        record["occurred"] = occurred
        record["source"] = source_output
        buckets.setdefault((facet, day), []).append(record)

    # Append each bucket to its facet/day JSONL file.
    paths: list[Path] = []

    for (facet, day), records in buckets.items():
        target_dir = journal_root / "facets" / facet / "events"
        target_dir.mkdir(parents=True, exist_ok=True)

        target = target_dir / f"{day}.jsonl"
        with target.open("a", encoding="utf-8") as fh:
            fh.writelines(
                json.dumps(rec, ensure_ascii=False) + "\n" for rec in records
            )
        paths.append(target)

    return paths
174
175
def compute_output_source(context: dict) -> str:
    """Compute relative source output path from hook context.

    Args:
        context: Hook context dict with day, segment, name, output_path, meta.

    Returns:
        Relative path like "20240101/agents/meetings.md".
    """
    from think.talent import get_output_name
    from think.utils import get_journal

    day = context.get("day", "")
    output_path = context.get("output_path", "")
    name = context.get("name", "unknown")
    journal = get_journal()

    try:
        return os.path.relpath(output_path, journal)
    except ValueError:
        # relpath can fail (e.g. paths on different Windows drives);
        # reconstruct the path from the context fields instead.
        meta = context.get("meta", {})
        facet = meta.get("facet") if meta else None
        segment = context.get("segment")
        filename = f"{get_output_name(name)}.md"

        # Assemble day / [segment] / "agents" / [facet] / filename.
        parts = [day]
        if segment:
            parts.append(segment)
        parts.append("agents")
        if facet:
            parts.append(facet)
        parts.append(filename)
        return os.path.join(*parts)