# personal memory agent
1# SPDX-License-Identifier: AGPL-3.0-only
2# Copyright (c) 2026 sol pbc
3
4"""Shared utilities for output extraction hooks.
5
6This module provides common functions used by extraction hooks like
7occurrence.py and anticipation.py in the talent/ directory.
8"""
9
10import json
11import logging
12import os
13from pathlib import Path
14
15# Minimum content length for meaningful event extraction
16MIN_EXTRACTION_CHARS = 50
17
18
19def should_skip_extraction(result: str, context: dict) -> str | None:
20 """Check if extraction should be skipped and return reason, or None to proceed.
21
22 Args:
23 result: The generated output markdown content.
24 context: Hook context dict with meta and span.
25
26 Returns:
27 Skip reason string if extraction should be skipped, None otherwise.
28 """
29 meta = context.get("meta", {})
30
31 # Skip if extraction disabled via journal config
32 if meta.get("extract") is False:
33 return "extraction disabled via journal config"
34
35 # Skip for JSON output (output IS the structured data)
36 if meta.get("output") == "json":
37 return "JSON output (already structured)"
38
39 # Skip in span mode (multiple sequential segments)
40 if context.get("span"):
41 return "span mode"
42
43 # Skip for minimal content
44 if len(result.strip()) < MIN_EXTRACTION_CHARS:
45 return f"minimal content ({len(result.strip())} chars < {MIN_EXTRACTION_CHARS})"
46
47 return None
48
49
def log_extraction_failure(e: Exception, name: str) -> None:
    """Log enhanced diagnostics for extraction generation failures.

    Handles IncompleteJSONError specially by logging a single-line summary
    with a head+tail sample and degenerate repetition detection.

    Args:
        e: The exception from generate().
        name: Generator name for log context.
    """
    from think.models import IncompleteJSONError

    # Anything other than a truncated-JSON failure gets a plain error line.
    if not isinstance(e, IncompleteJSONError):
        logging.error("Extraction generation failed for %s: %s", name, e)
        return

    partial = e.partial_text
    total = len(partial)

    def _flatten(text: str) -> str:
        # Collapse newlines so the sample stays on one greppable log line.
        return text.replace("\n", "\\n").replace("\r", "")

    # Short payloads are shown whole; long ones as head ... tail excerpt.
    if total <= 300:
        sample = _flatten(partial)
    else:
        sample = f"{_flatten(partial[:150])} ... {_flatten(partial[-150:])}"

    # Degenerate-repetition heuristic: very few distinct chars in the tail.
    tail = partial if total < 1000 else partial[-1000:]
    distinct = len(set(tail))
    flag = ""
    if distinct < 20:
        flag = (
            f" [POSSIBLE DEGENERATE REPETITION: "
            f"{distinct} unique chars in last {len(tail)}]"
        )

    logging.error(
        "Extraction generation failed for %s: %s "
        "(partial_text: %d chars, %d unique in tail%s) sample: %s",
        name,
        e,
        total,
        distinct,
        flag,
        sample,
    )
98
99
def write_events_jsonl(
    events: list[dict],
    agent: str,
    occurred: bool,
    source_output: str,
    capture_day: str,
) -> list[Path]:
    """Write events to facet-based JSONL files.

    Groups events by facet and writes each to the appropriate file:
    facets/{facet}/events/{event_day}.jsonl

    Args:
        events: List of event dictionaries from extraction.
        agent: Source generator agent (e.g., "meetings", "schedule").
        occurred: True for occurrences, False for anticipations.
        source_output: Relative path to source output file.
        capture_day: Day the output was captured (YYYYMMDD).

    Returns:
        List of paths to written JSONL files.
    """
    from think.utils import get_journal

    journal_root = Path(get_journal())

    # Bucket events by (facet, event_day) so each target file is opened once.
    buckets: dict[tuple[str, str], list[dict]] = {}

    for raw in events:
        facet = raw.get("facet", "")
        if not facet:
            # Events without a facet have no destination file.
            continue

        if occurred:
            # Occurrences are filed under the capture day.
            day = capture_day
        else:
            # Anticipations are filed under their scheduled date,
            # normalized from YYYY-MM-DD to YYYYMMDD; fall back to capture day.
            scheduled = raw.get("date", "")
            day = scheduled.replace("-", "") if scheduled else capture_day

        if not day:
            continue

        # Copy before enriching so the caller's dicts stay untouched.
        record = dict(raw)
        record["agent"] = agent
        record["occurred"] = occurred
        record["source"] = source_output
        buckets.setdefault((facet, day), []).append(record)

    # Append each bucket to its facet/day JSONL file.
    paths: list[Path] = []

    for (facet, day), records in buckets.items():
        target_dir = journal_root / "facets" / facet / "events"
        target_dir.mkdir(parents=True, exist_ok=True)

        target = target_dir / f"{day}.jsonl"
        with target.open("a", encoding="utf-8") as fh:
            fh.writelines(
                json.dumps(rec, ensure_ascii=False) + "\n" for rec in records
            )
        paths.append(target)

    return paths
174
175
def compute_output_source(context: dict) -> str:
    """Compute relative source output path from hook context.

    Args:
        context: Hook context dict with day, segment, name, output_path, meta.

    Returns:
        Relative path like "20240101/agents/meetings.md".
    """
    from think.talent import get_output_name
    from think.utils import get_journal

    day = context.get("day", "")
    output_path = context.get("output_path", "")
    name = context.get("name", "unknown")
    journal = get_journal()

    try:
        return os.path.relpath(output_path, journal)
    except ValueError:
        # relpath can fail (e.g. paths on different Windows drives);
        # reconstruct the path from the context fields instead.
        meta = context.get("meta", {})
        facet = meta.get("facet") if meta else None
        segment = context.get("segment")
        filename = f"{get_output_name(name)}.md"

        # Assemble day / [segment] / "agents" / [facet] / filename.
        parts = [day]
        if segment:
            parts.append(segment)
        parts.append("agents")
        if facet:
            parts.append(facet)
        parts.append(filename)
        return os.path.join(*parts)