# personal memory agent
1# SPDX-License-Identifier: AGPL-3.0-only
2# Copyright (c) 2026 sol pbc
3
4"""Formatters framework for JSONL and Markdown files.
5
6This module provides a registry-based system for converting structured files
7to markdown chunks. Each formatter is a plain function that lives near its
8source domain code.
9
10Supported file types:
11 - JSONL (.jsonl): Parsed as JSON lines, passed as list[dict] to formatter
12 - Markdown (.md): Read as text, passed as str to formatter
13
14Output contract: All formatters return tuple[list[dict], dict] where:
15 - list[dict]: Chunks, each with:
16 - markdown: str (formatted markdown for this chunk)
17 - timestamp: int (optional - unix timestamp in milliseconds for ordering)
18 - source: dict (optional - original entry from JSONL for enriched streams)
19 - dict: Metadata about the formatting with optional keys:
20 - header: str - Optional header markdown (metadata summary, context, etc.)
21 - error: str - Optional error/warning message (e.g., skipped entries)
22 - indexer: dict - Indexing metadata with keys:
23 - agent: str - Content type (e.g., "event", "audio", "screen")
24 JSONL formatters must provide agent. Markdown agent is path-derived.
25 Day and facet are extracted from path by extract_path_metadata().
26
27JSONL formatters receive list[dict] entries and are responsible for:
28 - Extracting metadata from entries (typically first line)
29 - Building header from metadata if applicable
30 - Formatting content entries into chunks
31 - Providing indexer.agent in the meta dict
32
33Markdown formatters receive str text and perform semantic chunking.
34"""
35
36import argparse
37import fnmatch
38import json
39import os
40import sys
41from importlib import import_module
42from pathlib import Path
43from typing import Any, Callable
44
45from think.utils import DATE_RE, get_journal
46
47
def extract_path_metadata(rel_path: str) -> dict[str, str]:
    """Extract indexing metadata from a journal-relative path.

    Extracts day and facet from path structure. For markdown files, also
    derives agent from path. For JSONL files, agent should be provided
    by the formatter via meta["indexer"]["agent"].

    Recognized layouts (see FORMATTERS for the registered pattern list):
      - YYYYMMDD/...                      -> day from the first component
      - .../agents/{facet}/{...}/...      -> facet (only when a file exists
        below the facet directory)
      - facets/{facet}/...                -> facet, plus day from a YYYYMMDD
        filename or an activities/{YYYYMMDD}/... subpath
      - imports/{YYYYMMDD_HHMMSS}/...     -> day from the import id
      - config/actions/{YYYYMMDD}.jsonl   -> day from the filename

    Args:
        rel_path: Journal-relative path (e.g., "20240101/agents/flow.md")

    Returns:
        Dict with keys: day, facet, agent
        - day: YYYYMMDD string or empty
        - facet: Facet name or empty
        - agent: Derived agent for .md files, empty for .jsonl
    """
    # Normalize Windows separators so one code path handles both styles.
    parts = rel_path.replace("\\", "/").split("/")
    filename = parts[-1]
    basename = os.path.splitext(filename)[0]
    is_markdown = filename.endswith(".md")

    day = ""
    facet = ""
    agent = ""

    # Extract day from YYYYMMDD directory prefix
    if parts[0] and DATE_RE.fullmatch(parts[0]):
        day = parts[0]

    # Extract facet from agents/{facet}/... paths.
    # The "+ 2" bound requires a file *below* the facet directory, so a
    # day-level path like "20240101/agents/flow.md" yields no facet.
    try:
        agents_idx = parts.index("agents")
        if agents_idx + 2 < len(parts):
            facet = parts[agents_idx + 1]
    except ValueError:
        pass  # No "agents" component anywhere in the path.

    # Extract facet from facets/{facet}/... paths
    if parts[0] == "facets" and len(parts) >= 3:
        facet = parts[1]
        # Day from YYYYMMDD filename (events/entities/todos/news)
        if len(parts) >= 4 and DATE_RE.fullmatch(basename):
            day = basename
        # Day from activities/{YYYYMMDD}/{activity_id}/... directory structure
        elif (
            len(parts) >= 5 and parts[2] == "activities" and DATE_RE.fullmatch(parts[3])
        ):
            day = parts[3]

    # Extract day from imports/YYYYMMDD_HHMMSS/...
    # NOTE(review): an import id without "_" falls back to its first 8 chars;
    # assumes ids always begin with a YYYYMMDD date — confirm with importers.
    if parts[0] == "imports" and len(parts) >= 2:
        import_id = parts[1]
        day = import_id.split("_")[0] if "_" in import_id else import_id[:8]

    # Extract day from config/actions/YYYYMMDD.jsonl (journal-level logs)
    if parts[0] == "config" and len(parts) >= 3 and parts[1] == "actions":
        if DATE_RE.fullmatch(basename):
            day = basename

    # Derive agent for markdown files only; JSONL agents come from formatters.
    if is_markdown:
        if parts[0] == "facets" and len(parts) >= 4 and parts[2] == "news":
            agent = "news"
        elif parts[0] == "imports":
            agent = "import"
        elif parts[0] == "apps" and len(parts) >= 4:
            agent = f"{parts[1]}:{basename}"
        else:
            # Daily agent outputs, segment markdown: use basename
            agent = basename

    return {"day": day, "facet": facet, "agent": agent}
120
121
# Registry mapping glob patterns to (module_path, function_name, indexed).
# Patterns are matched against journal-relative paths and must be specific
# enough to use as Path.glob() arguments from the journal root. The indexed
# flag controls whether find_formattable_files() collects matching files for
# the search index. Adding a new journal content location requires a new
# entry here — see docs/JOURNAL.md "Search Index" for details.
#
# Order matters: first match wins, so place specific patterns before general ones.
# Formatter functions are referenced by dotted module path + name and are
# imported lazily by get_formatter().
FORMATTERS: dict[str, tuple[str, str, bool]] = {
    # JSONL formatters (indexed)
    "config/actions/*.jsonl": ("think.facets", "format_logs", True),
    "facets/*/entities/*/observations.jsonl": (
        "think.entities.formatting",
        "format_observations",
        True,
    ),
    "facets/*/entities/*.jsonl": ("think.entities.formatting", "format_entities", True),
    # Single JSON document — loaded via format_file()'s ".json" branch.
    "entities/*/entity.json": (
        "think.entities.formatting",
        "format_entity_identity",
        False,  # Indexed via _index_entity_search_chunks (enriched with relationship data)
    ),
    "facets/*/events/*.jsonl": ("think.events", "format_events", True),
    "facets/*/calendar/*.jsonl": ("think.events", "format_events", True),
    "facets/*/todos/*.jsonl": ("apps.todos.todo", "format_todos", True),
    "facets/*/logs/*.jsonl": ("think.facets", "format_logs", True),
    # Structured file imports (indexed)
    "*/import.*/imported.jsonl": (
        "think.importers.formatting",
        "format_imported",
        True,
    ),
    # Markdown transcript imports (new convention + legacy)
    "*/import.*/*/*_transcript.md": ("think.markdown", "format_markdown", True),
    "*/import.*/*/imported.md": ("think.markdown", "format_markdown", True),
    # AI chat imports — dedicated formatter (new + legacy filenames)
    "*/import.chatgpt/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.claude/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.gemini/*/conversation_transcript.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    # Legacy AI chat import filenames (backward compat)
    "*/import.chatgpt/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.claude/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    "*/import.gemini/*/imported_audio.jsonl": (
        "think.importers.formatting",
        "format_ai_chat",
        True,
    ),
    # Raw transcripts — formattable but not indexed (agent outputs are more useful)
    # Layout: day/stream/segment/audio.jsonl
    "*/*/*/audio.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/*_audio.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/*_transcript.jsonl": ("observe.hear", "format_audio", False),
    "*/*/*/screen.jsonl": ("observe.screen", "format_screen", False),
    "*/*/*/*_screen.jsonl": ("observe.screen", "format_screen", False),
    # Markdown — day-level agents output and segment-level (day/stream/segment/agents/)
    "*/agents/*.md": ("think.markdown", "format_markdown", True),
    # Layout: day/stream/segment/agents/*.md
    "*/*/*/agents/*.md": ("think.markdown", "format_markdown", True),
    "*/*/*/agents/*/*.md": ("think.markdown", "format_markdown", True),
    "facets/*/activities/*/*/*.md": ("think.markdown", "format_markdown", True),
    "facets/*/news/*.md": ("think.markdown", "format_markdown", True),
    "imports/*/summary.md": ("think.markdown", "format_markdown", True),
    "apps/*/agents/*.md": ("think.markdown", "format_markdown", True),
}
206
207
def get_formatter(file_path: str) -> Callable | None:
    """Return formatter function for a journal-relative file path.

    Scans the FORMATTERS registry in declaration order (first match wins)
    and resolves the matching formatter, regardless of its indexed flag.

    Args:
        file_path: Journal-relative path (e.g., "20240101/agents/flow.md")

    Returns:
        Formatter function or None if no pattern matches
    """
    matched = next(
        (
            spec
            for pattern, spec in FORMATTERS.items()
            if fnmatch.fnmatch(file_path, pattern)
        ),
        None,
    )
    if matched is None:
        return None

    module_path, func_name, _indexed = matched
    # Lazy import: the formatter's module is only loaded on first use.
    return getattr(import_module(module_path), func_name)
225
226
227def load_jsonl(file_path: str | Path) -> list[dict[str, Any]]:
228 """Load entries from a JSONL file.
229
230 Args:
231 file_path: Absolute path to JSONL file
232
233 Returns:
234 List of parsed JSON objects (one per line)
235 """
236 entries = []
237 with open(file_path, "r", encoding="utf-8") as f:
238 for line in f:
239 line = line.strip()
240 if not line:
241 continue
242 try:
243 entries.append(json.loads(line))
244 except json.JSONDecodeError:
245 continue
246 return entries
247
248
249def load_markdown(file_path: str | Path) -> str:
250 """Load text from a markdown file.
251
252 Args:
253 file_path: Absolute path to markdown file
254
255 Returns:
256 File contents as string
257 """
258 with open(file_path, "r", encoding="utf-8") as f:
259 return f.read()
260
261
def find_formattable_files(journal: str) -> dict[str, str]:
    """Find all indexable files in the journal.

    Globs every FORMATTERS pattern whose indexed flag is set, starting from
    the journal root. The registry is the single source of truth for what
    gets indexed.

    Args:
        journal: Path to journal root directory

    Returns:
        Mapping of journal-relative paths to absolute paths
    """
    root = Path(journal)
    found: dict[str, str] = {}

    indexed_patterns = (
        pattern for pattern, (_mod, _func, indexed) in FORMATTERS.items() if indexed
    )
    for pattern in indexed_patterns:
        for hit in root.glob(pattern):
            if hit.is_file():
                found[str(hit.relative_to(root))] = str(hit)

    return found
286
287
def format_file(
    file_path: str | Path,
    context: dict[str, Any] | None = None,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Load file, detect formatter, return formatted chunks and metadata.

    File must be under the journal root. Supports JSONL, JSON, and Markdown
    files: ``.md`` is read as text, ``.json`` is loaded as a single-entry
    list, and everything else is parsed as JSONL.

    Args:
        file_path: Absolute path, or a path relative to the current working
            directory (resolved with ``Path.resolve()`` before the
            journal-containment check).
        context: Optional context dict passed to formatter. The caller's
            dict is never mutated; a shallow copy is used internally.

    Returns:
        Tuple of (chunks, meta) where:
            - chunks: List of dicts with "markdown" key (and optional "timestamp")
            - meta: Dict with optional "header" and "error" keys

    Raises:
        ValueError: If file is outside journal or no formatter found
        FileNotFoundError: If file doesn't exist
    """
    journal_path = Path(get_journal()).resolve()
    file_path = Path(file_path).resolve()

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    # Require file to be under journal
    if not file_path.is_relative_to(journal_path):
        raise ValueError(f"File is outside journal directory: {file_path}")

    rel_path = str(file_path.relative_to(journal_path))

    formatter = get_formatter(rel_path)
    if formatter is None:
        raise ValueError(f"No formatter found for: {rel_path}")

    # Load file based on extension
    if file_path.suffix == ".md":
        content = load_markdown(file_path)
    elif file_path.suffix == ".json":
        # Wrap the single JSON document in a list so JSON and JSONL
        # formatters share the list[dict] input contract.
        with open(file_path, encoding="utf-8") as f:
            content = [json.load(f)]
    else:
        content = load_jsonl(file_path)

    # Copy the caller's context before setdefault() below. The previous
    # `ctx = context or {}` aliased the caller's dict, leaking "file_path"
    # back into the caller's object as a side effect.
    ctx = dict(context) if context else {}
    ctx.setdefault("file_path", file_path)

    return formatter(content, ctx)
339
340
341def _format_chunk_summary(chunks: list[dict], raw_chunks: list[dict] | None) -> None:
342 """Print human-readable chunk summary (for markdown files with raw chunks)."""
343 print(f"Total chunks: {len(chunks)}\n")
344 for i, chunk in enumerate(chunks):
345 # Use raw chunk data if available, otherwise extract from markdown
346 if raw_chunks and i < len(raw_chunks):
347 c = raw_chunks[i]
348 chunk_type = c.get("type", "unknown")
349 header_path = c.get("header_path", [])
350 intro = c.get("intro")
351 preview = c.get("preview", "")
352 else:
353 chunk_type = "chunk"
354 header_path = []
355 intro = None
356 preview = chunk.get("markdown", "")[:70]
357
358 path = " > ".join(f"H{h['level']}:{h['text']}" for h in header_path)
359 print(f"#{i:3d} [{chunk_type:13s}]")
360 if path:
361 print(f" path: {path}")
362 if intro:
363 print(f' intro: "{intro[:60]}{"..." if len(intro) > 60 else ""}"')
364 print(f" {preview[:70]}{'...' if len(preview) > 70 else ''}")
365 print()
366
367
def main() -> None:
    """CLI entry point for sol formatter.

    Formats a JSONL/Markdown file via format_file() and prints the result
    as JSON (default), concatenated markdown, or a human-readable summary.
    Exits with status 1 on invalid --context JSON, formatting errors, or
    an out-of-range --index.
    """
    # Local import of the project CLI bootstrap helper.
    from think.utils import setup_cli

    parser = argparse.ArgumentParser(
        description="Convert JSONL or Markdown files to formatted chunks"
    )
    parser.add_argument("file", help="Path to JSONL or Markdown file")
    parser.add_argument(
        "-f",
        "--format",
        choices=["json", "markdown", "summary"],
        default="json",
        help="Output format (default: json)",
    )
    parser.add_argument(
        "-i",
        "--index",
        type=int,
        help="Show only the chunk at this index",
    )
    parser.add_argument(
        "--join",
        action="store_true",
        help="Output concatenated markdown (shorthand for --format=markdown)",
    )
    parser.add_argument(
        "--context",
        type=str,
        help="JSON string of context to pass to formatter",
    )
    args = setup_cli(parser)

    # --join is shorthand for --format=markdown
    if args.join:
        args.format = "markdown"

    # Parse --context before touching the file so bad JSON fails fast.
    try:
        context = json.loads(args.context) if args.context else None
    except json.JSONDecodeError as e:
        print(f"Error parsing context JSON: {e}", file=sys.stderr)
        sys.exit(1)

    try:
        chunks, meta = format_file(args.file, context)
    except (ValueError, FileNotFoundError) as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    # For summary format on markdown files, get raw chunks with metadata
    raw_chunks = None
    if args.format == "summary" and args.file.endswith(".md"):
        from think.markdown import chunk_markdown

        text = load_markdown(args.file)
        raw_chunks = chunk_markdown(text)

    # Filter to single chunk if requested
    if args.index is not None:
        if 0 <= args.index < len(chunks):
            chunks = [chunks[args.index]]
            # NOTE(review): assumes raw_chunks is at least as long as chunks;
            # an index valid for chunks but not raw_chunks would raise
            # IndexError — confirm chunk_markdown stays in step with the
            # registered markdown formatter.
            if raw_chunks:
                raw_chunks = [raw_chunks[args.index]]
        else:
            print(
                f"Error: Index {args.index} out of range (0-{len(chunks) - 1})",
                file=sys.stderr,
            )
            sys.exit(1)

    if args.format == "markdown":
        # Output concatenated markdown with header first
        parts = []
        if meta.get("header"):
            parts.append(meta["header"])
        parts.extend(chunk["markdown"] for chunk in chunks)
        print("\n".join(parts))
    elif args.format == "summary":
        _format_chunk_summary(chunks, raw_chunks)
    else:
        # Output JSON object with metadata and chunks
        print(json.dumps({"meta": meta, "chunks": chunks}, indent=2))
450
451
# Allow direct execution of this module as a CLI tool.
if __name__ == "__main__":
    main()