# personal memory agent
# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Formatters framework for JSONL and Markdown files.

This module provides a registry-based system for converting structured files
to markdown chunks. Each formatter is a plain function that lives near its
source domain code.

Supported file types:
  - JSONL (.jsonl): Parsed as JSON lines, passed as list[dict] to formatter
  - Markdown (.md): Read as text, passed as str to formatter

Output contract: All formatters return tuple[list[dict], dict] where:
  - list[dict]: Chunks, each with:
    - markdown: str (formatted markdown for this chunk)
    - timestamp: int (optional - unix timestamp in milliseconds for ordering)
    - source: dict (optional - original entry from JSONL for enriched streams)
  - dict: Metadata about the formatting with optional keys:
    - header: str - Optional header markdown (metadata summary, context, etc.)
    - error: str - Optional error/warning message (e.g., skipped entries)
    - indexer: dict - Indexing metadata with keys:
      - agent: str - Content type (e.g., "event", "audio", "screen")
      JSONL formatters must provide agent. Markdown agent is path-derived.
      Day and facet are extracted from path by extract_path_metadata().

JSONL formatters receive list[dict] entries and are responsible for:
  - Extracting metadata from entries (typically first line)
  - Building header from metadata if applicable
  - Formatting content entries into chunks
  - Providing indexer.agent in the meta dict

Markdown formatters receive str text and perform semantic chunking.
"""

import argparse
import fnmatch
import json
import os
import sys
from importlib import import_module
from pathlib import Path
from typing import Any, Callable

from think.utils import DATE_RE, get_journal


def extract_path_metadata(rel_path: str) -> dict[str, str]:
    """Extract indexing metadata from a journal-relative path.

    Extracts day and facet from path structure. For markdown files, also
    derives agent from path. For JSONL files, agent should be provided
    by the formatter via meta["indexer"]["agent"].

    Args:
        rel_path: Journal-relative path (e.g., "20240101/agents/flow.md")

    Returns:
        Dict with keys: day, facet, agent
        - day: YYYYMMDD string or empty
        - facet: Facet name or empty
        - agent: Derived agent for .md files, empty for .jsonl
    """
    segments = rel_path.replace("\\", "/").split("/")
    leaf = segments[-1]
    stem = os.path.splitext(leaf)[0]

    day, facet, agent = "", "", ""

    # A leading YYYYMMDD directory names the day.
    if segments[0] and DATE_RE.fullmatch(segments[0]):
        day = segments[0]

    # agents/{facet}/... — facet is the segment right after "agents",
    # provided at least two more segments follow it.
    if "agents" in segments:
        marker = segments.index("agents")
        if marker + 2 < len(segments):
            facet = segments[marker + 1]

    # facets/{facet}/... paths
    if segments[0] == "facets" and len(segments) >= 3:
        facet = segments[1]
        if len(segments) >= 4 and DATE_RE.fullmatch(stem):
            # Day encoded in the filename (events/entities/todos/news).
            day = stem
        elif (
            len(segments) >= 5
            and segments[2] == "activities"
            and DATE_RE.fullmatch(segments[3])
        ):
            # activities/{YYYYMMDD}/{activity_id}/... directory structure.
            day = segments[3]

    # imports/YYYYMMDD_HHMMSS/... — day is the date half of the import id.
    if segments[0] == "imports" and len(segments) >= 2:
        import_id = segments[1]
        day = import_id.split("_")[0] if "_" in import_id else import_id[:8]

    # config/actions/YYYYMMDD.jsonl (journal-level logs).
    if segments[0] == "config" and len(segments) >= 3 and segments[1] == "actions":
        if DATE_RE.fullmatch(stem):
            day = stem

    # Agent is derived from the path for markdown files only; JSONL
    # formatters supply it themselves via meta["indexer"]["agent"].
    if leaf.endswith(".md"):
        if segments[0] == "facets" and len(segments) >= 4 and segments[2] == "news":
            agent = "news"
        elif segments[0] == "imports":
            agent = "import"
        elif segments[0] == "apps" and len(segments) >= 4:
            agent = f"{segments[1]}:{stem}"
        else:
            # Daily agent outputs, segment markdown: use basename.
            agent = stem

    return {"day": day, "facet": facet, "agent": agent}


# Registry mapping glob patterns to (module_path, function_name, indexed).
# Patterns are matched against journal-relative paths and must be specific
# enough to use as Path.glob() arguments from the journal root. The indexed
# flag controls whether find_formattable_files() collects matching files for
# the search index. Adding a new journal content location requires a new
# entry here — see docs/JOURNAL.md "Search Index" for details.
#
# Order matters: first match wins, so place specific patterns before general ones.
130FORMATTERS: dict[str, tuple[str, str, bool]] = { 131 # JSONL formatters (indexed) 132 "config/actions/*.jsonl": ("think.facets", "format_logs", True), 133 "facets/*/entities/*/observations.jsonl": ( 134 "think.entities.formatting", 135 "format_observations", 136 True, 137 ), 138 "facets/*/entities/*.jsonl": ("think.entities.formatting", "format_entities", True), 139 "entities/*/entity.json": ( 140 "think.entities.formatting", 141 "format_entity_identity", 142 False, # Indexed via _index_entity_search_chunks (enriched with relationship data) 143 ), 144 "facets/*/events/*.jsonl": ("think.events", "format_events", True), 145 "facets/*/calendar/*.jsonl": ("think.events", "format_events", True), 146 "facets/*/todos/*.jsonl": ("apps.todos.todo", "format_todos", True), 147 "facets/*/logs/*.jsonl": ("think.facets", "format_logs", True), 148 # Structured file imports (indexed) 149 "*/import.*/imported.jsonl": ( 150 "think.importers.formatting", 151 "format_imported", 152 True, 153 ), 154 # Markdown transcript imports (new convention + legacy) 155 "*/import.*/*/*_transcript.md": ("think.markdown", "format_markdown", True), 156 "*/import.*/*/imported.md": ("think.markdown", "format_markdown", True), 157 # AI chat imports — dedicated formatter (new + legacy filenames) 158 "*/import.chatgpt/*/conversation_transcript.jsonl": ( 159 "think.importers.formatting", 160 "format_ai_chat", 161 True, 162 ), 163 "*/import.claude/*/conversation_transcript.jsonl": ( 164 "think.importers.formatting", 165 "format_ai_chat", 166 True, 167 ), 168 "*/import.gemini/*/conversation_transcript.jsonl": ( 169 "think.importers.formatting", 170 "format_ai_chat", 171 True, 172 ), 173 # Legacy AI chat import filenames (backward compat) 174 "*/import.chatgpt/*/imported_audio.jsonl": ( 175 "think.importers.formatting", 176 "format_ai_chat", 177 True, 178 ), 179 "*/import.claude/*/imported_audio.jsonl": ( 180 "think.importers.formatting", 181 "format_ai_chat", 182 True, 183 ), 184 
"*/import.gemini/*/imported_audio.jsonl": ( 185 "think.importers.formatting", 186 "format_ai_chat", 187 True, 188 ), 189 # Raw transcripts — formattable but not indexed (agent outputs are more useful) 190 # Layout: day/stream/segment/audio.jsonl 191 "*/*/*/audio.jsonl": ("observe.hear", "format_audio", False), 192 "*/*/*/*_audio.jsonl": ("observe.hear", "format_audio", False), 193 "*/*/*/*_transcript.jsonl": ("observe.hear", "format_audio", False), 194 "*/*/*/screen.jsonl": ("observe.screen", "format_screen", False), 195 "*/*/*/*_screen.jsonl": ("observe.screen", "format_screen", False), 196 # Markdown — day-level agents output and segment-level (day/stream/segment/agents/) 197 "*/agents/*.md": ("think.markdown", "format_markdown", True), 198 # Layout: day/stream/segment/agents/*.md 199 "*/*/*/agents/*.md": ("think.markdown", "format_markdown", True), 200 "*/*/*/agents/*/*.md": ("think.markdown", "format_markdown", True), 201 "facets/*/activities/*/*/*.md": ("think.markdown", "format_markdown", True), 202 "facets/*/news/*.md": ("think.markdown", "format_markdown", True), 203 "imports/*/summary.md": ("think.markdown", "format_markdown", True), 204 "apps/*/agents/*.md": ("think.markdown", "format_markdown", True), 205} 206 207 208def get_formatter(file_path: str) -> Callable | None: 209 """Return formatter function for a journal-relative file path. 210 211 Matches against registered glob patterns (regardless of indexed flag). 212 213 Args: 214 file_path: Journal-relative path (e.g., "20240101/agents/flow.md") 215 216 Returns: 217 Formatter function or None if no pattern matches 218 """ 219 for pattern, (module_path, func_name, _indexed) in FORMATTERS.items(): 220 if fnmatch.fnmatch(file_path, pattern): 221 module = import_module(module_path) 222 return getattr(module, func_name) 223 224 return None 225 226 227def load_jsonl(file_path: str | Path) -> list[dict[str, Any]]: 228 """Load entries from a JSONL file. 
229 230 Args: 231 file_path: Absolute path to JSONL file 232 233 Returns: 234 List of parsed JSON objects (one per line) 235 """ 236 entries = [] 237 with open(file_path, "r", encoding="utf-8") as f: 238 for line in f: 239 line = line.strip() 240 if not line: 241 continue 242 try: 243 entries.append(json.loads(line)) 244 except json.JSONDecodeError: 245 continue 246 return entries 247 248 249def load_markdown(file_path: str | Path) -> str: 250 """Load text from a markdown file. 251 252 Args: 253 file_path: Absolute path to markdown file 254 255 Returns: 256 File contents as string 257 """ 258 with open(file_path, "r", encoding="utf-8") as f: 259 return f.read() 260 261 262def find_formattable_files(journal: str) -> dict[str, str]: 263 """Find all indexable files in the journal. 264 265 Globs each indexed FORMATTERS pattern from the journal root to discover 266 files. The registry is the single source of truth for what gets indexed. 267 268 Args: 269 journal: Path to journal root directory 270 271 Returns: 272 Mapping of journal-relative paths to absolute paths 273 """ 274 files: dict[str, str] = {} 275 journal_path = Path(journal) 276 277 for pattern, (_mod, _func, indexed) in FORMATTERS.items(): 278 if not indexed: 279 continue 280 for match in journal_path.glob(pattern): 281 if match.is_file(): 282 rel = str(match.relative_to(journal_path)) 283 files[rel] = str(match) 284 285 return files 286 287 288def format_file( 289 file_path: str | Path, 290 context: dict[str, Any] | None = None, 291) -> tuple[list[dict[str, Any]], dict[str, Any]]: 292 """Load file, detect formatter, return formatted chunks and metadata. 293 294 File must be under the journal root. Supports JSONL, JSON, and Markdown files. 
295 296 Args: 297 file_path: Absolute or journal-relative path to file 298 context: Optional context dict passed to formatter 299 300 Returns: 301 Tuple of (chunks, meta) where: 302 - chunks: List of dicts with "markdown" key (and optional "timestamp") 303 - meta: Dict with optional "header" and "error" keys 304 305 Raises: 306 ValueError: If file is outside journal or no formatter found 307 FileNotFoundError: If file doesn't exist 308 """ 309 journal_path = Path(get_journal()).resolve() 310 file_path = Path(file_path).resolve() 311 312 if not file_path.exists(): 313 raise FileNotFoundError(f"File not found: {file_path}") 314 315 # Require file to be under journal 316 if not file_path.is_relative_to(journal_path): 317 raise ValueError(f"File is outside journal directory: {file_path}") 318 319 rel_path = str(file_path.relative_to(journal_path)) 320 321 formatter = get_formatter(rel_path) 322 if formatter is None: 323 raise ValueError(f"No formatter found for: {rel_path}") 324 325 # Load file based on extension 326 if file_path.suffix == ".md": 327 content = load_markdown(file_path) 328 elif file_path.suffix == ".json": 329 with open(file_path, encoding="utf-8") as f: 330 content = [json.load(f)] 331 else: 332 content = load_jsonl(file_path) 333 334 # Build context with file path info 335 ctx = context or {} 336 ctx.setdefault("file_path", file_path) 337 338 return formatter(content, ctx) 339 340 341def _format_chunk_summary(chunks: list[dict], raw_chunks: list[dict] | None) -> None: 342 """Print human-readable chunk summary (for markdown files with raw chunks).""" 343 print(f"Total chunks: {len(chunks)}\n") 344 for i, chunk in enumerate(chunks): 345 # Use raw chunk data if available, otherwise extract from markdown 346 if raw_chunks and i < len(raw_chunks): 347 c = raw_chunks[i] 348 chunk_type = c.get("type", "unknown") 349 header_path = c.get("header_path", []) 350 intro = c.get("intro") 351 preview = c.get("preview", "") 352 else: 353 chunk_type = "chunk" 354 
header_path = [] 355 intro = None 356 preview = chunk.get("markdown", "")[:70] 357 358 path = " > ".join(f"H{h['level']}:{h['text']}" for h in header_path) 359 print(f"#{i:3d} [{chunk_type:13s}]") 360 if path: 361 print(f" path: {path}") 362 if intro: 363 print(f' intro: "{intro[:60]}{"..." if len(intro) > 60 else ""}"') 364 print(f" {preview[:70]}{'...' if len(preview) > 70 else ''}") 365 print() 366 367 368def main() -> None: 369 """CLI entry point for sol formatter.""" 370 from think.utils import setup_cli 371 372 parser = argparse.ArgumentParser( 373 description="Convert JSONL or Markdown files to formatted chunks" 374 ) 375 parser.add_argument("file", help="Path to JSONL or Markdown file") 376 parser.add_argument( 377 "-f", 378 "--format", 379 choices=["json", "markdown", "summary"], 380 default="json", 381 help="Output format (default: json)", 382 ) 383 parser.add_argument( 384 "-i", 385 "--index", 386 type=int, 387 help="Show only the chunk at this index", 388 ) 389 parser.add_argument( 390 "--join", 391 action="store_true", 392 help="Output concatenated markdown (shorthand for --format=markdown)", 393 ) 394 parser.add_argument( 395 "--context", 396 type=str, 397 help="JSON string of context to pass to formatter", 398 ) 399 args = setup_cli(parser) 400 401 # --join is shorthand for --format=markdown 402 if args.join: 403 args.format = "markdown" 404 405 try: 406 context = json.loads(args.context) if args.context else None 407 except json.JSONDecodeError as e: 408 print(f"Error parsing context JSON: {e}", file=sys.stderr) 409 sys.exit(1) 410 411 try: 412 chunks, meta = format_file(args.file, context) 413 except (ValueError, FileNotFoundError) as e: 414 print(f"Error: {e}", file=sys.stderr) 415 sys.exit(1) 416 417 # For summary format on markdown files, get raw chunks with metadata 418 raw_chunks = None 419 if args.format == "summary" and args.file.endswith(".md"): 420 from think.markdown import chunk_markdown 421 422 text = load_markdown(args.file) 423 
raw_chunks = chunk_markdown(text) 424 425 # Filter to single chunk if requested 426 if args.index is not None: 427 if 0 <= args.index < len(chunks): 428 chunks = [chunks[args.index]] 429 if raw_chunks: 430 raw_chunks = [raw_chunks[args.index]] 431 else: 432 print( 433 f"Error: Index {args.index} out of range (0-{len(chunks) - 1})", 434 file=sys.stderr, 435 ) 436 sys.exit(1) 437 438 if args.format == "markdown": 439 # Output concatenated markdown with header first 440 parts = [] 441 if meta.get("header"): 442 parts.append(meta["header"]) 443 parts.extend(chunk["markdown"] for chunk in chunks) 444 print("\n".join(parts)) 445 elif args.format == "summary": 446 _format_chunk_summary(chunks, raw_chunks) 447 else: 448 # Output JSON object with metadata and chunks 449 print(json.dumps({"meta": meta, "chunks": chunks}, indent=2)) 450 451 452if __name__ == "__main__": 453 main()