# personal memory agent
# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Formatters framework for JSONL and Markdown files.

This module provides a registry-based system for converting structured files
to markdown chunks. Each formatter is a plain function that lives near its
source domain code.

Supported file types:
  - JSONL (.jsonl): Parsed as JSON lines, passed as list[dict] to formatter
  - Markdown (.md): Read as text, passed as str to formatter

Output contract: All formatters return tuple[list[dict], dict] where:
  - list[dict]: Chunks, each with:
    - markdown: str (formatted markdown for this chunk)
    - timestamp: int (optional - unix timestamp in milliseconds for ordering)
    - source: dict (optional - original entry from JSONL for enriched streams)
  - dict: Metadata about the formatting with optional keys:
    - header: str - Optional header markdown (metadata summary, context, etc.)
    - error: str - Optional error/warning message (e.g., skipped entries)
    - indexer: dict - Indexing metadata with keys:
      - agent: str - Content type (e.g., "event", "audio", "screen")
      JSONL formatters must provide agent. Markdown agent is path-derived.
      Day and facet are extracted from path by extract_path_metadata().

JSONL formatters receive list[dict] entries and are responsible for:
  - Extracting metadata from entries (typically first line)
  - Building header from metadata if applicable
  - Formatting content entries into chunks
  - Providing indexer.agent in the meta dict

Markdown formatters receive str text and perform semantic chunking.
"""

import argparse
import fnmatch
import json
import os
import sys
from importlib import import_module
from pathlib import Path
from typing import Any, Callable

from think.utils import DATE_RE, get_journal


def extract_path_metadata(rel_path: str) -> dict[str, str]:
    """Extract indexing metadata from a journal-relative path.

    Extracts day and facet from path structure. For markdown files, also
    derives agent from path. For JSONL files, agent should be provided
    by the formatter via meta["indexer"]["agent"].

    Args:
        rel_path: Journal-relative path (e.g., "20240101/agents/flow.md")

    Returns:
        Dict with keys: day, facet, agent
        - day: YYYYMMDD string or empty
        - facet: Facet name or empty
        - agent: Derived agent for .md files, empty for .jsonl
    """
    segments = rel_path.replace("\\", "/").split("/")
    leaf = segments[-1]
    stem = os.path.splitext(leaf)[0]

    day, facet, agent = "", "", ""

    # A leading YYYYMMDD directory names the day.
    if segments[0] and DATE_RE.fullmatch(segments[0]):
        day = segments[0]

    # agents/{facet}/... — facet is the segment right after "agents",
    # provided at least two more segments follow it.
    if "agents" in segments:
        marker = segments.index("agents")
        if marker + 2 < len(segments):
            facet = segments[marker + 1]

    # facets/{facet}/... paths
    if segments[0] == "facets" and len(segments) >= 3:
        facet = segments[1]
        if len(segments) >= 4 and DATE_RE.fullmatch(stem):
            # Day encoded in the filename (events/entities/todos/news).
            day = stem
        elif (
            len(segments) >= 5
            and segments[2] == "activities"
            and DATE_RE.fullmatch(segments[3])
        ):
            # activities/{YYYYMMDD}/{activity_id}/... directory structure.
            day = segments[3]

    # imports/YYYYMMDD_HHMMSS/... — day is the date half of the import id.
    if segments[0] == "imports" and len(segments) >= 2:
        import_id = segments[1]
        day = import_id.split("_")[0] if "_" in import_id else import_id[:8]

    # config/actions/YYYYMMDD.jsonl (journal-level logs).
    if segments[0] == "config" and len(segments) >= 3 and segments[1] == "actions":
        if DATE_RE.fullmatch(stem):
            day = stem

    # Agent is derived from the path for markdown files only; JSONL
    # formatters supply it themselves via meta["indexer"]["agent"].
    if leaf.endswith(".md"):
        if segments[0] == "facets" and len(segments) >= 4 and segments[2] == "news":
            agent = "news"
        elif segments[0] == "imports":
            agent = "import"
        elif segments[0] == "apps" and len(segments) >= 4:
            agent = f"{segments[1]}:{stem}"
        else:
            # Daily agent outputs, segment markdown: use basename.
            agent = stem

    return {"day": day, "facet": facet, "agent": agent}


# Registry mapping glob patterns to (module_path, function_name, indexed).
# Patterns are matched against journal-relative paths and must be specific
# enough to use as Path.glob() arguments from the journal root. The indexed
# flag controls whether find_formattable_files() collects matching files for
# the search index. Adding a new journal content location requires a new
# entry here — see docs/JOURNAL.md "Search Index" for details.
#
# Order matters: first match wins, so place specific patterns before general ones.
130FORMATTERS: dict[str, tuple[str, str, bool]] = { 131 # JSONL formatters (indexed) 132 "config/actions/*.jsonl": ("think.facets", "format_logs", True), 133 "facets/*/entities/*/observations.jsonl": ( 134 "think.entities.formatting", 135 "format_observations", 136 True, 137 ), 138 "facets/*/entities/*.jsonl": ("think.entities.formatting", "format_entities", True), 139 "entities/*/entity.json": ( 140 "think.entities.formatting", 141 "format_entity_identity", 142 False, # Indexed via _index_entity_search_chunks (enriched with relationship data) 143 ), 144 "facets/*/events/*.jsonl": ("think.events", "format_events", True), 145 "facets/*/calendar/*.jsonl": ("think.events", "format_events", True), 146 "facets/*/todos/*.jsonl": ("apps.todos.todo", "format_todos", True), 147 "facets/*/logs/*.jsonl": ("think.facets", "format_logs", True), 148 # Structured file imports (indexed) 149 "*/import.*/imported.jsonl": ( 150 "think.importers.formatting", 151 "format_imported", 152 True, 153 ), 154 # Markdown transcript imports (new convention + legacy) 155 "*/import.*/*/*_transcript.md": ("think.markdown", "format_markdown", True), 156 "*/import.*/*/imported.md": ("think.markdown", "format_markdown", True), 157 # AI chat imports — dedicated formatter (new + legacy filenames) 158 "*/import.chatgpt/*/conversation_transcript.jsonl": ( 159 "think.importers.formatting", 160 "format_ai_chat", 161 True, 162 ), 163 "*/import.claude/*/conversation_transcript.jsonl": ( 164 "think.importers.formatting", 165 "format_ai_chat", 166 True, 167 ), 168 "*/import.gemini/*/conversation_transcript.jsonl": ( 169 "think.importers.formatting", 170 "format_ai_chat", 171 True, 172 ), 173 # Legacy AI chat import filenames (backward compat) 174 "*/import.chatgpt/*/imported_audio.jsonl": ( 175 "think.importers.formatting", 176 "format_ai_chat", 177 True, 178 ), 179 "*/import.claude/*/imported_audio.jsonl": ( 180 "think.importers.formatting", 181 "format_ai_chat", 182 True, 183 ), 184 
"*/import.gemini/*/imported_audio.jsonl": ( 185 "think.importers.formatting", 186 "format_ai_chat", 187 True, 188 ), 189 # Raw transcripts — formattable but not indexed (agent outputs are more useful) 190 # Layout: day/stream/segment/audio.jsonl 191 "*/*/*/audio.jsonl": ("observe.hear", "format_audio", False), 192 "*/*/*/*_audio.jsonl": ("observe.hear", "format_audio", False), 193 "*/*/*/*_transcript.jsonl": ("observe.hear", "format_audio", False), 194 "*/*/*/screen.jsonl": ("observe.screen", "format_screen", False), 195 "*/*/*/*_screen.jsonl": ("observe.screen", "format_screen", False), 196 # Markdown — day-level agents output and segment-level (day/stream/segment/agents/) 197 "*/agents/*.md": ("think.markdown", "format_markdown", True), 198 # Layout: day/stream/segment/agents/*.md 199 "*/*/*/agents/*.md": ("think.markdown", "format_markdown", True), 200 "*/*/*/agents/*/*.md": ("think.markdown", "format_markdown", True), 201 "facets/*/activities/*/*/*.md": ("think.markdown", "format_markdown", True), 202 "facets/*/news/*.md": ("think.markdown", "format_markdown", True), 203 "imports/*/summary.md": ("think.markdown", "format_markdown", True), 204 "apps/*/agents/*.md": ("think.markdown", "format_markdown", True), 205} 206 207 208def get_formatter(file_path: str) -> Callable | None: 209 """Return formatter function for a journal-relative file path. 210 211 Matches against registered glob patterns (regardless of indexed flag). 212 213 Args: 214 file_path: Journal-relative path (e.g., "20240101/agents/flow.md") 215 216 Returns: 217 Formatter function or None if no pattern matches 218 """ 219 for pattern, (module_path, func_name, _indexed) in FORMATTERS.items(): 220 if fnmatch.fnmatch(file_path, pattern): 221 module = import_module(module_path) 222 return getattr(module, func_name) 223 224 return None 225 226 227def load_jsonl(file_path: str | Path) -> list[dict[str, Any]]: 228 """Load entries from a JSONL file. 
229 230 Args: 231 file_path: Absolute path to JSONL file 232 233 Returns: 234 List of parsed JSON objects (one per line) 235 """ 236 entries = [] 237 with open(file_path, "r", encoding="utf-8") as f: 238 for line in f: 239 line = line.strip() 240 if not line: 241 continue 242 try: 243 entries.append(json.loads(line)) 244 except json.JSONDecodeError: 245 continue 246 return entries 247 248 249def load_markdown(file_path: str | Path) -> str: 250 """Load text from a markdown file. 251 252 Args: 253 file_path: Absolute path to markdown file 254 255 Returns: 256 File contents as string 257 """ 258 with open(file_path, "r", encoding="utf-8") as f: 259 return f.read() 260 261 262def find_formattable_files(journal: str) -> dict[str, str]: 263 """Find all indexable files in the journal. 264 265 Globs each indexed FORMATTERS pattern from the journal root to discover 266 files. The registry is the single source of truth for what gets indexed. 267 268 Args: 269 journal: Path to journal root directory 270 271 Returns: 272 Mapping of journal-relative paths to absolute paths 273 """ 274 files: dict[str, str] = {} 275 journal_path = Path(journal) 276 277 for pattern, (_mod, _func, indexed) in FORMATTERS.items(): 278 if not indexed: 279 continue 280 for match in journal_path.glob(pattern): 281 if match.is_file(): 282 rel = str(match.relative_to(journal_path)) 283 files[rel] = str(match) 284 285 return files 286 287 288def format_file( 289 file_path: str | Path, 290 context: dict[str, Any] | None = None, 291) -> tuple[list[dict[str, Any]], dict[str, Any]]: 292 """Load file, detect formatter, return formatted chunks and metadata. 293 294 File must be under the journal root. Supports JSONL, JSON, and Markdown files. 
295 296 Args: 297 file_path: Absolute or journal-relative path to file 298 context: Optional context dict passed to formatter 299 300 Returns: 301 Tuple of (chunks, meta) where: 302 - chunks: List of dicts with "markdown" key (and optional "timestamp") 303 - meta: Dict with optional "header" and "error" keys 304 305 Raises: 306 ValueError: If file is outside journal or no formatter found 307 FileNotFoundError: If file doesn't exist 308 """ 309 journal_path = Path(get_journal()).resolve() 310 file_path = Path(file_path).resolve() 311 312 if not file_path.exists(): 313 raise FileNotFoundError(f"File not found: {file_path}") 314 315 # Require file to be under journal 316 if not file_path.is_relative_to(journal_path): 317 raise ValueError(f"File is outside journal directory: {file_path}") 318 319 rel_path = str(file_path.relative_to(journal_path)) 320 321 formatter = get_formatter(rel_path) 322 if formatter is None: 323 raise ValueError(f"No formatter found for: {rel_path}") 324 325 # Load file based on extension 326 if file_path.suffix == ".md": 327 content = load_markdown(file_path) 328 elif file_path.suffix == ".json": 329 with open(file_path, encoding="utf-8") as f: 330 content = [json.load(f)] 331 else: 332 content = load_jsonl(file_path) 333 334 # Build context with file path info 335 ctx = context or {} 336 ctx.setdefault("file_path", file_path) 337 338 return formatter(content, ctx) 339 340 341def _format_chunk_summary(chunks: list[dict], raw_chunks: list[dict] | None) -> None: 342 """Print human-readable chunk summary (for markdown files with raw chunks).""" 343 print(f"Total chunks: {len(chunks)}\n") 344 for i, chunk in enumerate(chunks): 345 # Use raw chunk data if available, otherwise extract from markdown 346 if raw_chunks and i < len(raw_chunks): 347 c = raw_chunks[i] 348 chunk_type = c.get("type", "unknown") 349 header_path = c.get("header_path", []) 350 intro = c.get("intro") 351 preview = c.get("preview", "") 352 else: 353 chunk_type = "chunk" 354 
header_path = [] 355 intro = None 356 preview = chunk.get("markdown", "")[:70] 357 358 path = " > ".join(f"H{h['level']}:{h['text']}" for h in header_path) 359 print(f"#{i:3d} [{chunk_type:13s}]") 360 if path: 361 print(f" path: {path}") 362 if intro: 363 print(f' intro: "{intro[:60]}{"..." if len(intro) > 60 else ""}"') 364 print(f" {preview[:70]}{'...' if len(preview) > 70 else ''}") 365 print() 366 367 368def main() -> None: 369 """CLI entry point for sol formatter.""" 370 from think.utils import setup_cli 371 372 parser = argparse.ArgumentParser( 373 description="Convert JSONL or Markdown files to formatted chunks" 374 ) 375 parser.add_argument("file", help="Path to JSONL or Markdown file") 376 parser.add_argument( 377 "-f", 378 "--format", 379 choices=["json", "markdown", "summary"], 380 default="json", 381 help="Output format (default: json)", 382 ) 383 parser.add_argument( 384 "-i", 385 "--index", 386 type=int, 387 help="Show only the chunk at this index", 388 ) 389 parser.add_argument( 390 "--join", 391 action="store_true", 392 help="Output concatenated markdown (shorthand for --format=markdown)", 393 ) 394 parser.add_argument( 395 "--context", 396 type=str, 397 help="JSON string of context to pass to formatter", 398 ) 399 args = setup_cli(parser) 400 401 # --join is shorthand for --format=markdown 402 if args.join: 403 args.format = "markdown" 404 405 try: 406 context = json.loads(args.context) if args.context else None 407 except json.JSONDecodeError as e: 408 print(f"Error parsing context JSON: {e}", file=sys.stderr) 409 sys.exit(1) 410 411 try: 412 chunks, meta = format_file(args.file, context) 413 except (ValueError, FileNotFoundError) as e: 414 print(f"Error: {e}", file=sys.stderr) 415 sys.exit(1) 416 417 # For summary format on markdown files, get raw chunks with metadata 418 raw_chunks = None 419 if args.format == "summary" and args.file.endswith(".md"): 420 from think.markdown import chunk_markdown 421 422 text = load_markdown(args.file) 423 
raw_chunks = chunk_markdown(text) 424 425 # Filter to single chunk if requested 426 if args.index is not None: 427 if 0 <= args.index < len(chunks): 428 chunks = [chunks[args.index]] 429 if raw_chunks: 430 raw_chunks = [raw_chunks[args.index]] 431 else: 432 print( 433 f"Error: Index {args.index} out of range (0-{len(chunks) - 1})", 434 file=sys.stderr, 435 ) 436 sys.exit(1) 437 438 if args.format == "markdown": 439 # Output concatenated markdown with header first 440 parts = [] 441 if meta.get("header"): 442 parts.append(meta["header"]) 443 parts.extend(chunk["markdown"] for chunk in chunks) 444 print("\n".join(parts)) 445 elif args.format == "summary": 446 _format_chunk_summary(chunks, raw_chunks) 447 else: 448 # Output JSON object with metadata and chunks 449 print(json.dumps({"meta": meta, "chunks": chunks}, indent=2)) 450 451 452if __name__ == "__main__": 453 main()