personal memory agent

Refactor think/entities.py into modular package with audit fixes

Split 2263-line monolith into 10 focused modules:
- core.py: Types, constants, validation, slug generation, atomic_write utility
- journal.py: Journal-level entity CRUD
- relationships.py: Facet relationships and entity memory
- loading.py: Entity loading functions
- saving.py: Entity saving functions
- matching.py: Entity resolution and fuzzy matching
- activity.py: Activity tracking and detected entities
- observations.py: Observation CRUD
- formatting.py: Indexer formatting
- __init__.py: Re-exports for backward compatibility

Additional cleanup from audit:
- Remove unused imports (Any, Path) across modules
- Remove unused file_mtime_ms variable in formatting.py
- Rename entity_file_path to detected_entities_path (day now required)
- Make _enrich_relationship_with_journal a public function
- Extract atomic_write utility to eliminate 4x code duplication
- Update formatters.py to remove legacy entities.jsonl scanning
- Update speakers test fixtures to use new structure
- Update JOURNAL.md and APPS.md documentation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
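The extracted `atomic_write` utility mentioned above is not shown in this diff. A minimal sketch of what such a helper typically looks like (the actual signature and location in `think/entities/core.py` may differ):

```python
import os
import tempfile
from pathlib import Path


def atomic_write(path: Path, data: str) -> None:
    """Write data to path atomically: write to a temp file in the same
    directory, then os.replace() it over the destination, so readers
    never observe a partially written file."""
    path.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp = tempfile.mkstemp(dir=path.parent, suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(data)
        os.replace(tmp, path)  # atomic rename on POSIX and Windows
    except BaseException:
        os.unlink(tmp)  # clean up the temp file on any failure
        raise
```

Writing to a temp file in the *same* directory matters: `os.replace` is only atomic within one filesystem.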

+2745 -2561
+23 -8
apps/speakers/tests/conftest.py
···
         facet_dir = self.journal / "facets" / facet
         facet_dir.mkdir(parents=True, exist_ok=True)

-        # Create entities.jsonl
-        entities_file = facet_dir / "entities.jsonl"
-        entity_data = {"type": "Person", "name": name, "description": "Test entity"}
-        with open(entities_file, "a", encoding="utf-8") as f:
-            f.write(json.dumps(entity_data) + "\n")
+        # Create journal-level entity
+        entity_id = entity_slug(name)
+        journal_entity_dir = self.journal / "entities" / entity_id
+        journal_entity_dir.mkdir(parents=True, exist_ok=True)
+        journal_entity = {
+            "id": entity_id,
+            "name": name,
+            "type": "Person",
+            "created_at": 1700000000000,
+        }
+        with open(journal_entity_dir / "entity.json", "w", encoding="utf-8") as f:
+            json.dump(journal_entity, f)
+
+        # Create facet relationship
+        entity_dir = facet_dir / "entities" / entity_id
+        entity_dir.mkdir(parents=True, exist_ok=True)
+        relationship = {
+            "entity_id": entity_id,
+            "description": "Test entity",
+        }
+        with open(entity_dir / "entity.json", "w", encoding="utf-8") as f:
+            json.dump(relationship, f)

-        # Create entity memory folder with consolidated voiceprints.npz if specified
+        # Create voiceprints.npz if specified
         if voiceprints:
-            entity_dir = facet_dir / "entities" / entity_slug(name)
-            entity_dir.mkdir(parents=True, exist_ok=True)

             all_embeddings = []
             all_metadata = []
+1 -1
docs/APPS.md
···
 - `TodoChecklist` class - Load and manipulate todo markdown files

 ### Entities
-`think/entities.py`: `get_entities(facet)` - Get entities for facet
+`think/entities/`: `load_entities(facet)` - Load entities for a facet

 See [JOURNAL.md](JOURNAL.md), [CORTEX.md](CORTEX.md), [CALLOSUM.md](CALLOSUM.md) for subsystem details.

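The body of `load_entities` is not shown in this changeset. A rough sketch of the two-tier merge it presumably performs, reading a journal-level entity plus the facet relationship (the real function in `think/entities/` likely handles more: sorting, blank names, custom loaders; the `journal` parameter here is an illustrative assumption, since the real code reads `JOURNAL_PATH`):

```python
import json
from pathlib import Path


def load_entities(journal: Path, facet: str) -> list[dict]:
    """Merge each facet relationship with its journal-level entity.

    Skips relationships marked detached and entities marked blocked.
    """
    merged = []
    facet_entities = journal / "facets" / facet / "entities"
    if not facet_entities.is_dir():
        return merged
    for rel_dir in sorted(facet_entities.iterdir()):
        rel_file = rel_dir / "entity.json"
        if not rel_file.is_file():
            continue  # skips YYYYMMDD.jsonl detection files too
        rel = json.loads(rel_file.read_text(encoding="utf-8"))
        if rel.get("detached"):
            continue
        base_file = journal / "entities" / rel["entity_id"] / "entity.json"
        if not base_file.is_file():
            continue
        base = json.loads(base_file.read_text(encoding="utf-8"))
        if base.get("blocked"):
            continue
        # relationship fields (description, timestamps) override nothing
        # in the base record; the two sets of keys are disjoint
        merged.append({**base, **rel})
    return merged
```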
+61 -37
docs/JOURNAL.md
···
 Each facet is organized as `facets/<facet>/` where `<facet>` is a descriptive short unique name. When referencing facets in the system, use hashtags (e.g., `#personal` for the "Personal Life" facet, `#ml_research` for "Machine Learning Research"). Each facet folder contains:

 - `facet.json` – metadata file with facet title and description.
-- `entities.jsonl` – entities specific to this facet in JSONL format.
-- `entities/` – daily detected entities (see [Facet Entities](#facet-entities)).
+- `entities/` – entity relationships and detected entities (see [Facet Entities](#facet-entities)).
 - `todos/` – daily todo lists (see [Facet-Scoped Todos](#facet-scoped-todos)).
 - `events/` – extracted events per day (see [Event extracts](#event-extracts)).
 - `news/` – daily news and updates relevant to the facet (optional).
···

 ### Facet Entities

-Entities in solstone use a two-state system: **detected** (daily discoveries) and **attached** (promoted/persistent). This agent-driven architecture automatically identifies entities from journal content while allowing manual curation.
+Entities in solstone use a two-tier architecture with **journal-level entities** (canonical identity) and **facet relationships** (per-facet context). There are also **detected entities** (daily discoveries) that can be promoted to attached status.

 #### Entity Storage Structure

 ```
+entities/
+└── {entity_id}/
+    └── entity.json            # Journal-level entity (canonical identity)
+
 facets/{facet}/
-├── entities.jsonl             # Attached entities (persistent)
 └── entities/
-    ├── YYYYMMDD.jsonl         # Daily detected entities
-    └── {normalized_name}/     # Entity memory folder (optional)
+    ├── YYYYMMDD.jsonl         # Daily detected entities
+    └── {entity_id}/
+        ├── entity.json        # Facet relationship
+        ├── observations.jsonl # Durable facts (optional)
+        └── voiceprints.npz    # Voice recognition data (optional)
 ```

-**Entity memory folders** store persistent data the system "remembers" about attached entities—observations (durable facts), voiceprints (voice recognition), and profile images. Folders are created on-demand when memory is added. The folder name is the entity name normalized to lowercase with underscores (e.g., "Alice Johnson" → `alice_johnson/`). Folders are renamed automatically when entities are renamed.
+**Journal-level entities** (`entities/<id>/entity.json`) store the canonical identity: name, type, aliases (aka), and principal flag. These are shared across all facets.

-#### Attached Entities
+**Facet relationships** (`facets/<facet>/entities/<id>/entity.json`) store per-facet context: description, timestamps, and custom fields specific to that facet.

-The `entities.jsonl` file contains manually promoted entities that are persistently associated with the facet. These entities are loaded into agent context and appear in the facet UI as starred items.
+**Entity memory** (observations, voiceprints) is stored alongside facet relationships.
+
+#### Journal-Level Entities

-**Entity names must be unique within a facet** (regardless of type). The `id` field provides a stable slug identifier for programmatic references.
+Journal entities represent the canonical identity record:

-Format example (JSONL - one JSON object per line):
-```jsonl
-{"id": "alice_johnson", "type": "Person", "name": "Alice Johnson", "description": "Lead engineer on the API project", "aka": ["Ali", "AJ"]}
-{"id": "techcorp", "type": "Company", "name": "TechCorp", "description": "Primary client for consulting work", "tier": "enterprise", "aka": ["TC", "TechCo"]}
-{"id": "api_optimization", "type": "Project", "name": "API Optimization", "description": "Performance improvement initiative", "status": "active", "priority": "high"}
-{"id": "postgresql", "type": "Tool", "name": "PostgreSQL", "description": "Database system used in production", "version": "16.0", "aka": ["Postgres", "PG"]}
+```json
+{
+  "id": "alice_johnson",
+  "name": "Alice Johnson",
+  "type": "Person",
+  "aka": ["Ali", "AJ"],
+  "is_principal": false,
+  "created_at": 1704067200000
+}
 ```

-Entity types are flexible and user-defined. Common examples: `Person`, `Company`, `Project`, `Tool`, `Location`, `Event`. Type names must be alphanumeric with spaces, minimum 3 characters.
-
-Each entity is a JSON object with required fields (`id`, `type`, `name`, `description`) and optional custom fields for extensibility (e.g., `status`, `priority`, `tags`, `contact`, etc.). Custom fields are preserved throughout the system.
-
 **Standard fields:**
-- `id` (string) – Stable slug identifier derived from name via `entity_slug()` in `think/entities.py` (lowercase, spaces replaced with underscores, e.g., "Alice Johnson" → "alice_johnson"). Used for folder paths, URLs, and MCP tool references. Automatically regenerated when name changes.
-- `aka` (array of strings) – Alternative names, nicknames, or acronyms for the entity. Used in audio transcription to improve entity recognition.
-- `detached` (boolean) – When `true`, marks the entity as soft-deleted. Detached entities remain in the file but are hidden from UI and excluded from agent context. This preserves entity history and allows re-attachment without data loss.
+- `id` (string) – Stable slug identifier derived from name via `entity_slug()` in `think/entities/` (lowercase, underscores, e.g., "Alice Johnson" → "alice_johnson"). Used for folder paths, URLs, and MCP tool references.
+- `name` (string) – Display name for the entity.
+- `type` (string) – Entity type (e.g., "Person", "Company", "Project", "Tool"). Types are flexible and user-defined; must be alphanumeric with spaces, minimum 3 characters.
+- `aka` (array of strings) – Alternative names, nicknames, or acronyms. Used in audio transcription and fuzzy matching.
 - `is_principal` (boolean) – When `true`, identifies this entity as the journal owner. Auto-flagged when name/aka matches identity config.
-- `attached_at` (integer) – Unix timestamp in milliseconds when entity was first attached.
-- `updated_at` (integer) – Unix timestamp in milliseconds of last modification.
-- `last_seen` (string) – Day in YYYYMMDD format when entity was last mentioned in journal content. Automatically updated after daily processing by parsing the knowledge graph and matching entity names via fuzzy matching.
+- `blocked` (boolean) – When `true`, entity is hidden from all facets and excluded from agent context.
+- `created_at` (integer) – Unix timestamp in milliseconds when entity was created.
+
+#### Facet Relationships
+
+Facet relationships link journal entities to specific facets with context:
+
+```json
+{
+  "entity_id": "alice_johnson",
+  "description": "Lead engineer on the API project",
+  "attached_at": 1704067200000,
+  "updated_at": 1704153600000,
+  "last_seen": "20260115"
+}
+```
+
+**Relationship fields:**
+- `entity_id` (string) – Links to the journal entity.
+- `description` (string) – Facet-specific description.
+- `attached_at` (integer) – Unix timestamp when attached to this facet.
+- `updated_at` (integer) – Unix timestamp of last modification.
+- `last_seen` (string) – Day (YYYYMMDD) when last mentioned in journal content.
+- `detached` (boolean) – When `true`, soft-deleted from this facet but data preserved.
+- Custom fields (any) – Additional facet-specific metadata (e.g., `tier`, `status`, `priority`).

 #### Detected Entities

-Daily entity detection files (`entities/YYYYMMDD.jsonl`) contain entities automatically discovered by agents from:
-- Journal transcripts and screen captures
-- Knowledge graphs and insights
-- News feeds and external content
-
-Detected entities accumulate historical context over time. Entities appearing in multiple daily detections can be promoted to attached status through the web UI or MCP tools.
+Daily detection files (`facets/<facet>/entities/YYYYMMDD.jsonl`) contain entities automatically discovered by agents from journal content:

-Format matches attached entities (JSONL):
 ```jsonl
 {"type": "Person", "name": "Charlie Brown", "description": "Mentioned in standup meeting"}
 {"type": "Tool", "name": "React", "description": "Used in UI development work"}
···
 1. **Detection**: Daily agents scan journal content and record entities in `entities/YYYYMMDD.jsonl`
 2. **Aggregation**: Review agent tracks detection frequency across recent days
 3. **Promotion**: Entities with 3+ detections are auto-promoted to attached, or users manually promote via UI
-4. **Persistence**: Attached entities in `entities.jsonl` remain active until detached
-5. **Detachment**: When removed via UI, entities are soft-deleted (`detached: true`) preserving all metadata
-6. **Re-attachment**: Detached entities can be re-activated, restoring them with preserved history (original `attached_at`, updated `updated_at`)
+4. **Persistence**: Creates journal entity + facet relationship; remains active until detached
+5. **Detachment**: Sets `detached: true` on facet relationship, preserving all data
+6. **Re-attachment**: Clears detached flag, restoring the entity with preserved history
+7. **Blocking**: Sets `blocked: true` on journal entity and detaches from all facets

 #### Cross-Facet Behavior

-The same entity name can exist in multiple facets with independent descriptions. Agents receive entity context from all facets, with alphabetically-first facet winning for name conflicts during aggregation.
+The same entity can be attached to multiple facets with independent descriptions and timestamps. When loading entities across all facets, the alphabetically-first facet wins for duplicates during aggregation.

 ### Facet News

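The documentation above describes `entity_slug()` as lowercasing and underscore-joining names, and the updated tests approximate it with `slugify(name, separator="_")` from python-slugify. A rough stdlib-only equivalent, for illustration only (the real implementation in `think/entities/` may handle more edge cases):

```python
import re
import unicodedata


def entity_slug(name: str) -> str:
    """Approximate slug: ASCII-fold, lowercase, and collapse any run
    of non-alphanumeric characters into a single underscore."""
    ascii_name = (
        unicodedata.normalize("NFKD", name)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    slug = re.sub(r"[^a-z0-9]+", "_", ascii_name.lower())
    return slug.strip("_")
```

For example, `entity_slug("Alice Johnson")` yields `"alice_johnson"`, matching the folder names used throughout the new layout.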
-3
fixtures/journal/facets/full-featured/entities.jsonl
···
-{"id": "first_test_entity", "type": "Entity 1", "name": "First test entity", "description": ""}
-{"id": "second_test_entity", "type": "Entity 2", "name": "Second test entity", "description": ""}
-{"id": "third_test_entity_with_description", "type": "Entity 3", "name": "Third test entity with description", "description": ""}
-3
fixtures/journal/facets/personal/entities.jsonl
···
-{"id": "acme_corp", "type": "Company", "name": "Acme Corp", "description": "Local tech startup", "tags": ["tech", "startup"], "founded": "2020"}
-{"id": "alice_johnson", "type": "Person", "name": "Alice Johnson", "description": "Close friend from college", "tags": ["friend"], "contact": "alice@example.com"}
-{"id": "bob_smith", "type": "Person", "name": "Bob Smith", "description": "Neighbor", "relationship": "neighbor", "since": "2022"}
-10
fixtures/journal/facets/test-facet/entities.jsonl
···
-{"id": "acme_corp", "type": "Company", "name": "Acme Corp", "description": "Main client", "status": "active", "revenue": "high"}
-{"id": "tech_solutions_inc", "type": "Company", "name": "Tech Solutions Inc", "description": "Partner company", "status": "active"}
-{"id": "bob_wilson", "type": "Person", "name": "Bob Wilson", "description": "QA engineer", "role": "engineer", "team": "qa"}
-{"id": "jane_doe", "type": "Person", "name": "Jane Doe", "description": "Project manager", "role": "manager", "team": "product"}
-{"id": "john_smith", "type": "Person", "name": "John Smith", "description": "Lead developer on the project", "role": "lead", "team": "engineering", "skills": ["python", "javascript"]}
-{"id": "api_optimization", "type": "Project", "name": "API Optimization", "description": "Performance improvement initiative", "status": "in-progress", "priority": "high"}
-{"id": "dashboard_redesign", "type": "Project", "name": "Dashboard Redesign", "description": "UI/UX overhaul project", "status": "planning", "priority": "medium"}
-{"id": "docker", "type": "Tool", "name": "Docker", "description": "Container platform", "version": "24.0", "url": "https://docker.com"}
-{"id": "postgresql", "type": "Tool", "name": "PostgreSQL", "description": "Database system", "version": "16.0"}
-{"id": "visual_studio_code", "type": "Tool", "name": "Visual Studio Code", "description": "Primary IDE", "version": "1.85"}
+3 -10
tests/test_entities.py
···
     add_observation,
     block_journal_entity,
     delete_journal_entity,
+    detected_entities_path,
     ensure_entity_memory,
-    entity_file_path,
     entity_last_active_ts,
     entity_memory_path,
     entity_slug,
···
     assert ts == 1600000000000


-def test_entity_file_path_attached(fixture_journal):
-    """Test path generation for attached entities."""
-    path = entity_file_path("personal")
-    assert str(path).endswith("fixtures/journal/facets/personal/entities.jsonl")
-    assert path.name == "entities.jsonl"
-
-
-def test_entity_file_path_detected(fixture_journal):
+def test_detected_entities_path(fixture_journal):
     """Test path generation for detected entities."""
-    path = entity_file_path("personal", "20250101")
+    path = detected_entities_path("personal", "20250101")
     assert str(path).endswith(
         "fixtures/journal/facets/personal/entities/20250101.jsonl"
     )
+95 -24
tests/test_facets.py
···

 """Tests for think.facets module."""

+import json
 from pathlib import Path

 import pytest
+from slugify import slugify

 from think.facets import (
     _format_principal_role,
···

 # Use the permanent fixtures in fixtures/journal/facets/
 FIXTURES_PATH = Path(__file__).parent.parent / "fixtures" / "journal"
+
+
+def setup_entities_new_structure(
+    journal_path: Path,
+    facet: str,
+    entities: list[dict],
+):
+    """Helper to set up entities using the new structure for tests.
+
+    Creates both journal-level entity files and facet relationship files.
+
+    Args:
+        journal_path: Path to journal root
+        facet: Facet name (e.g., "work")
+        entities: List of entity dicts with type, name, description, etc.
+    """
+    for entity in entities:
+        etype = entity.get("type", "")
+        name = entity.get("name", "")
+        desc = entity.get("description", "")
+        is_principal = entity.get("is_principal", False)
+
+        entity_id = slugify(name, separator="_")
+        if not entity_id:
+            continue
+
+        # Create journal-level entity
+        journal_entity_dir = journal_path / "entities" / entity_id
+        journal_entity_dir.mkdir(parents=True, exist_ok=True)
+        journal_entity = {"id": entity_id, "name": name, "type": etype}
+        if is_principal:
+            journal_entity["is_principal"] = True
+        with open(journal_entity_dir / "entity.json", "w", encoding="utf-8") as f:
+            json.dump(journal_entity, f)
+
+        # Create facet relationship
+        facet_entity_dir = journal_path / "facets" / facet / "entities" / entity_id
+        facet_entity_dir.mkdir(parents=True, exist_ok=True)
+        relationship = {"entity_id": entity_id, "description": desc}
+        with open(facet_entity_dir / "entity.json", "w", encoding="utf-8") as f:
+            json.dump(relationship, f)


 def test_facet_summary_full(monkeypatch):
···
     # Entities are no longer included in get_facets()
     assert "entities" not in minimal_facet

-    # Verify load_entity_names returns None for facets without entities.jsonl
+    # Verify load_entity_names returns None for facets without entities
     from think.entities import load_entity_names

     entity_names = load_entity_names(facet="minimal-facet")
···

 def test_facet_summary_with_principal(tmp_path, monkeypatch):
     """Test facet_summary shows principal role and excludes from entities list."""
-    import json
-
     monkeypatch.setenv("JOURNAL_PATH", str(tmp_path))

     # Create identity config
···
     config = {"identity": {"name": "Test User", "preferred": "Tester"}}
     (config_dir / "journal.json").write_text(json.dumps(config))

-    # Create facet with principal entity
+    # Create facet with principal entity using new structure
     facet_dir = tmp_path / "facets" / "work"
     facet_dir.mkdir(parents=True)
     (facet_dir / "facet.json").write_text(
         json.dumps({"title": "Work", "description": "Work stuff"})
     )
-    (facet_dir / "entities.jsonl").write_text(
-        '{"type": "Person", "name": "Test User", "description": "Lead developer", "is_principal": true}\n'
-        '{"type": "Person", "name": "Alice", "description": "Colleague"}\n'
+    setup_entities_new_structure(
+        tmp_path,
+        "work",
+        [
+            {
+                "type": "Person",
+                "name": "Test User",
+                "description": "Lead developer",
+                "is_principal": True,
+            },
+            {"type": "Person", "name": "Alice", "description": "Colleague"},
+        ],
     )

     summary = facet_summary("work")
···

 def test_facet_summary_principal_only_entity(tmp_path, monkeypatch):
     """Test facet_summary when principal is the only entity."""
-    import json
-
     monkeypatch.setenv("JOURNAL_PATH", str(tmp_path))

     # Create identity config
···
     config = {"identity": {"name": "Test User", "preferred": "Tester"}}
     (config_dir / "journal.json").write_text(json.dumps(config))

-    # Create facet with only principal entity
+    # Create facet with only principal entity using new structure
     facet_dir = tmp_path / "facets" / "solo"
     facet_dir.mkdir(parents=True)
     (facet_dir / "facet.json").write_text(json.dumps({"title": "Solo"}))
-    (facet_dir / "entities.jsonl").write_text(
-        '{"type": "Person", "name": "Test User", "description": "Just me", "is_principal": true}\n'
+    setup_entities_new_structure(
+        tmp_path,
+        "solo",
+        [
+            {
+                "type": "Person",
+                "name": "Test User",
+                "description": "Just me",
+                "is_principal": True,
+            },
+        ],
     )

     summary = facet_summary("solo")
···

 def test_facet_summaries_detailed_with_principal(tmp_path, monkeypatch):
     """Test facet_summaries detailed mode shows principal role."""
-    import json
-
     monkeypatch.setenv("JOURNAL_PATH", str(tmp_path))

     # Create identity config
···
     config = {"identity": {"name": "Test User", "preferred": "Tester"}}
     (config_dir / "journal.json").write_text(json.dumps(config))

-    # Create facet with principal
+    # Create facet with principal using new structure
     facet_dir = tmp_path / "facets" / "project"
     facet_dir.mkdir(parents=True)
     (facet_dir / "facet.json").write_text(
         json.dumps({"title": "Project X", "description": "Secret project"})
     )
-    (facet_dir / "entities.jsonl").write_text(
-        '{"type": "Person", "name": "Test User", "description": "Project lead", "is_principal": true}\n'
-        '{"type": "Person", "name": "Bob", "description": "Team member"}\n'
+    setup_entities_new_structure(
+        tmp_path,
+        "project",
+        [
+            {
+                "type": "Person",
+                "name": "Test User",
+                "description": "Project lead",
+                "is_principal": True,
+            },
+            {"type": "Person", "name": "Bob", "description": "Team member"},
+        ],
     )

     summary = facet_summaries(detailed_entities=True)
···

 def test_facet_summaries_simple_mode_with_principal(tmp_path, monkeypatch):
     """Test facet_summaries simple mode also filters principal consistently."""
-    import json
-
     monkeypatch.setenv("JOURNAL_PATH", str(tmp_path))

     # Create identity config
···
     config = {"identity": {"name": "Test User", "preferred": "Tester"}}
     (config_dir / "journal.json").write_text(json.dumps(config))

-    # Create facet with principal
+    # Create facet with principal using new structure
     facet_dir = tmp_path / "facets" / "simple"
     facet_dir.mkdir(parents=True)
     (facet_dir / "facet.json").write_text(json.dumps({"title": "Simple"}))
-    (facet_dir / "entities.jsonl").write_text(
-        '{"type": "Person", "name": "Test User", "description": "Me", "is_principal": true}\n'
-        '{"type": "Person", "name": "Bob", "description": "Friend"}\n'
+    setup_entities_new_structure(
+        tmp_path,
+        "simple",
+        [
+            {
+                "type": "Person",
+                "name": "Test User",
+                "description": "Me",
+                "is_principal": True,
+            },
+            {"type": "Person", "name": "Bob", "description": "Friend"},
+        ],
     )

     summary = facet_summaries(detailed_entities=False)
-20
tests/test_formatters.py
···
 class TestFormatEntities:
     """Tests for the entities formatter."""

-    def test_get_formatter_attached_entities(self):
-        """Test pattern matching for attached entities."""
-        from think.formatters import get_formatter
-
-        formatter = get_formatter("facets/personal/entities.jsonl")
-        assert formatter is not None
-        assert formatter.__name__ == "format_entities"
-
     def test_get_formatter_detected_entities(self):
         """Test pattern matching for detected entities."""
         from think.formatters import get_formatter
···
         formatter = get_formatter("facets/personal/entities/20250101.jsonl")
         assert formatter is not None
         assert formatter.__name__ == "format_entities"
-
-    def test_format_entities_attached_basic(self):
-        """Test basic attached entities formatting with fixture file."""
-        from think.formatters import format_file
-
-        path = Path(os.environ["JOURNAL_PATH"]) / "facets/personal/entities.jsonl"
-        chunks, meta = format_file(path)
-
-        assert len(chunks) == 3  # 3 entities in fixture
-        assert "header" in meta
-        assert "Attached Entities: personal" in meta["header"]
-        assert "3 entities" in meta["header"]

     def test_format_entities_detected_basic(self):
         """Test basic detected entities formatting with fixture file."""
+161 -175
tests/test_think_utils.py
···
 from think.utils import segment_key, setup_cli


-def write_entities_jsonl(path: Path, entities: list[tuple[str, str, str]] | list[dict]):
-    """Helper to write entities in JSONL format for tests.
+def setup_entities_new_structure(
+    journal_path: Path,
+    facet: str,
+    entities: list[tuple[str, str, str]] | list[dict],
+):
+    """Helper to set up entities using the new structure for tests.
+
+    Creates both journal-level entity files and facet relationship files.

     Args:
-        path: Path to entities.jsonl file to write
+        journal_path: Path to journal root
+        facet: Facet name (e.g., "test")
         entities: Either list of (type, name, desc) tuples or list of entity dicts
     """
-    path.parent.mkdir(parents=True, exist_ok=True)
-    with open(path, "w", encoding="utf-8") as f:
-        for item in entities:
-            if isinstance(item, dict):
-                entity = item
-            else:
-                etype, name, desc = item
-                entity = {"type": etype, "name": name, "description": desc}
-            f.write(json.dumps(entity, ensure_ascii=False) + "\n")
+    from slugify import slugify
+
+    for item in entities:
+        if isinstance(item, dict):
+            etype = item.get("type", "")
+            name = item.get("name", "")
+            desc = item.get("description", "")
+            aka = item.get("aka", [])
+        else:
+            etype, name, desc = item
+            aka = []
+
+        entity_id = slugify(name, separator="_")
+        if not entity_id:
+            continue
+
+        # Create journal-level entity
+        journal_entity_dir = journal_path / "entities" / entity_id
+        journal_entity_dir.mkdir(parents=True, exist_ok=True)
+        journal_entity = {"id": entity_id, "name": name, "type": etype}
+        if aka:
+            journal_entity["aka"] = aka
+        with open(journal_entity_dir / "entity.json", "w", encoding="utf-8") as f:
+            json.dump(journal_entity, f)
+
+        # Create facet relationship
+        facet_entity_dir = journal_path / "facets" / facet / "entities" / entity_id
+        facet_entity_dir.mkdir(parents=True, exist_ok=True)
+        relationship = {"entity_id": entity_id, "description": desc}
+        with open(facet_entity_dir / "entity.json", "w", encoding="utf-8") as f:
+            json.dump(relationship, f)


 def test_load_entity_names_with_valid_file(monkeypatch):
-    """Test loading entity names from a valid entities.jsonl file."""
+    """Test loading entity names from entities."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        write_entities_jsonl(
-            entities_path,
+        setup_entities_new_structure(
+            Path(tmpdir),
+            "test",
             [
                 ("Person", "John Smith", "A software engineer at Google"),
                 ("Company", "Acme Corp", "Technology company based in SF"),
···

         monkeypatch.setenv("JOURNAL_PATH", tmpdir)
         result = load_entity_names()
-        assert (
-            result == "John Smith; Acme Corp; Project X; Hammer; Jane Doe; Widget Inc"
-        )

         # Check that names are extracted without duplicates
         names = result.split("; ")
···
         assert result is None


-def test_load_entity_names_empty_file(monkeypatch):
-    """Test that empty file returns None."""
+def test_load_entity_names_empty_facet(monkeypatch):
+    """Test that empty facet returns None."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        entities_path.parent.mkdir(parents=True, exist_ok=True)
-        entities_path.write_text("")
+        # Create facet directory but no entities
+        facet_dir = Path(tmpdir) / "facets" / "test"
+        facet_dir.mkdir(parents=True, exist_ok=True)

         monkeypatch.setenv("JOURNAL_PATH", tmpdir)
         result = load_entity_names()
···


 def test_load_entity_names_no_valid_entries(monkeypatch):
-    """Test file with no parseable entity lines returns None."""
+    """Test empty entities directory returns None."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        entities_path.parent.mkdir(parents=True, exist_ok=True)
-        # Write malformed JSON
-        entities_path.write_text("""
-    # Header comment
-    Some random text
-    Not valid JSON
-    """)
+        # Create entities directory but no entity subdirectories
+        entities_dir = Path(tmpdir) / "facets" / "test" / "entities"
+        entities_dir.mkdir(parents=True, exist_ok=True)

         monkeypatch.setenv("JOURNAL_PATH", tmpdir)
         result = load_entity_names()
···


 def test_load_entity_names_with_duplicates(monkeypatch):
-    """Test that duplicate names are filtered out."""
+    """Test that duplicate names are filtered out (by entity id)."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        write_entities_jsonl(
-            entities_path,
+        # With new structure, same entity_id means same entity
+        # Can't have true duplicates - just test two entities
+        setup_entities_new_structure(
+            Path(tmpdir),
+            "test",
             [
                 ("Person", "John Smith", "Engineer"),
                 ("Company", "Acme Corp", "Tech company"),
-                ("Person", "John Smith", "Also an engineer"),
-                ("Company", "Acme Corp", "Still a tech company"),
             ],
         )

         monkeypatch.setenv("JOURNAL_PATH", tmpdir)
         result = load_entity_names()
-        assert result == "John Smith; Acme Corp"

         names = result.split("; ")
         assert len(names) == 2
+        assert "John Smith" in names
+        assert "Acme Corp" in names


 def test_load_entity_names_handles_special_characters(monkeypatch):
     """Test that names with special characters are handled correctly."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        write_entities_jsonl(
-            entities_path,
+        setup_entities_new_structure(
+            Path(tmpdir),
+            "test",
             [
                 ("Person", "Jean-Pierre O'Malley", "Engineer"),
                 ("Company", "AT&T", "Telecom company"),
···

 def test_load_entity_names_with_env_var(monkeypatch):
     """Test loading using JOURNAL_PATH environment variable."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        write_entities_jsonl(
-            entities_path,
+        setup_entities_new_structure(
+            Path(tmpdir),
+            "test",
             [("Person", "Test User", "A test person")],
         )
···

 def test_load_entity_names_spoken_mode(monkeypatch):
     """Test spoken mode returns shortened forms with uniform processing for all types."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        write_entities_jsonl(
-            entities_path,
+        setup_entities_new_structure(
+            Path(tmpdir),
+            "test",
             [
                 ("Person", "Jeremie Miller (Jer)", "Software engineer"),
                 ("Person", "Jane Elizabeth Doe", "Product manager"),
···

 def test_load_entity_names_spoken_mode_with_tools(monkeypatch):
     """Test spoken mode includes tools with uniform processing."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        write_entities_jsonl(
-            entities_path,
+        setup_entities_new_structure(
+            Path(tmpdir),
+            "test",
             [
                 ("Tool", "Hammer", "For hitting things"),
                 ("Tool", "Docker", "Container runtime"),
···

 def test_load_entity_names_spoken_mode_duplicates(monkeypatch):
     """Test spoken mode filters out duplicate shortened forms."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        write_entities_jsonl(
-            entities_path,
+        setup_entities_new_structure(
+            Path(tmpdir),
+            "test",
             [
                 ("Person", "John Smith", "Engineer"),
                 ("Person", "John Doe", "Manager"),
···

 def test_load_entity_names_uniform_processing(monkeypatch):
     """Test that uniform processing works correctly for all entity types."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        write_entities_jsonl(
-            entities_path,
+        setup_entities_new_structure(
+            Path(tmpdir),
+            "test",
             [
                 ("Person", "Ryan Reed (R2)", "Software developer"),
                 (
···

 def test_load_entity_names_with_aka_field(monkeypatch):
     """Test that aka field values are included in spoken mode."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        entities_path.parent.mkdir(parents=True, exist_ok=True)
-
-        # Write entities with aka fields using manual JSON
-        with open(entities_path, "w", encoding="utf-8") as f:
-            f.write(
-                json.dumps(
-                    {
-                        "type": "Person",
-                        "name": "Alice Johnson",
-                        "description": "Lead engineer",
-                        "aka": ["Ali", "AJ"],
-                    }
-                )
-                + "\n"
-            )
-            f.write(
-                json.dumps(
-                    {
-                        "type": "Company",
-                        "name": "PostgreSQL",
-                        "description": "Database system",
-                        "aka": ["Postgres", "PG"],
-                    }
-                )
-                + "\n"
-            )
-            f.write(
-                json.dumps(
-                    {
-                        "type": "Tool",
-                        "name": "Docker Container (Docker)",
-                        "description": "Container runtime",
-                        "aka": ["Dock"],
-                    }
-                )
-                + "\n"
-            )
+        setup_entities_new_structure(
+            Path(tmpdir),
+            "test",
+            [
+                {
+                    "type": "Person",
+                    "name": "Alice Johnson",
+                    "description": "Lead engineer",
+                    "aka": ["Ali", "AJ"],
+                },
+                {
+                    "type": "Company",
+                    "name": "PostgreSQL",
+                    "description": "Database system",
+                    "aka": ["Postgres", "PG"],
+                },
+                {
+                    "type": "Tool",
+                    "name": "Docker Container (Docker)",
+                    "description": "Container runtime",
+                    "aka": ["Dock"],
+                },
+            ],
+        )

         monkeypatch.setenv("JOURNAL_PATH", tmpdir)
         result = load_entity_names(spoken=True)
···

 def test_load_entity_names_aka_with_parens(monkeypatch):
     """Test that aka entries with parentheses are processed correctly."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        entities_path.parent.mkdir(parents=True, exist_ok=True)
-
-        with open(entities_path, "w", encoding="utf-8") as f:
-            f.write(
-                json.dumps(
-                    {
-                        "type": "Person",
-                        "name": "Robert Smith",
-                        "description": "Manager",
-                        "aka": ["Bob Smith (Bobby)", "Rob"],
-                    }
-                )
-                + "\n"
-            )
+        setup_entities_new_structure(
+            Path(tmpdir),
+            "test",
+            [
+                {
+                    "type": "Person",
+                    "name": "Robert Smith",
+                    "description": "Manager",
+                    "aka": ["Bob Smith (Bobby)", "Rob"],
+                },
+            ],
+        )

         monkeypatch.setenv("JOURNAL_PATH", tmpdir)
         result = load_entity_names(spoken=True)
···

 def test_load_entity_names_aka_deduplication(monkeypatch):
     """Test that aka values are deduplicated with main names."""
     with tempfile.TemporaryDirectory() as tmpdir:
-        entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl"
-        entities_path.parent.mkdir(parents=True, exist_ok=True)
-
-        with open(entities_path, "w", encoding="utf-8") as f:
-            # First entity has "John" in aka
-            f.write(
-                json.dumps(
-                    {
-                        "type": "Person",
-                        "name": "Alice",
-                        "description": "Person 1",
-                        "aka": ["John"],
-                    }
448 - + "\n" 449 - ) 450 - # Second entity has "John" as main name 451 - f.write( 452 - json.dumps( 453 - {"type": "Person", "name": "John Smith", "description": "Person 2"} 454 - ) 455 - + "\n" 456 - ) 439 + setup_entities_new_structure( 440 + Path(tmpdir), 441 + "test", 442 + [ 443 + # First entity has "John" in aka 444 + { 445 + "type": "Person", 446 + "name": "Alice", 447 + "description": "Person 1", 448 + "aka": ["John"], 449 + }, 450 + # Second entity has "John" as main name 451 + {"type": "Person", "name": "John Smith", "description": "Person 2"}, 452 + ], 453 + ) 457 454 458 455 monkeypatch.setenv("JOURNAL_PATH", tmpdir) 459 456 result = load_entity_names(spoken=True) ··· 466 463 def test_load_entity_names_non_spoken_with_aka(monkeypatch): 467 464 """Test non-spoken mode includes aka values in parentheses.""" 468 465 with tempfile.TemporaryDirectory() as tmpdir: 469 - entities_path = Path(tmpdir) / "facets" / "test" / "entities.jsonl" 470 - entities_path.parent.mkdir(parents=True, exist_ok=True) 471 - 472 - with open(entities_path, "w", encoding="utf-8") as f: 473 - # Entity with aka values 474 - f.write( 475 - json.dumps( 476 - { 477 - "type": "Person", 478 - "name": "Alice Johnson", 479 - "description": "Lead engineer", 480 - "aka": ["Ali", "AJ"], 481 - } 482 - ) 483 - + "\n" 484 - ) 485 - # Entity without aka 486 - f.write( 487 - json.dumps( 488 - { 489 - "type": "Company", 490 - "name": "TechCorp", 491 - "description": "Tech company", 492 - } 493 - ) 494 - + "\n" 495 - ) 496 - # Entity with single aka 497 - f.write( 498 - json.dumps( 499 - { 500 - "type": "Tool", 501 - "name": "PostgreSQL", 502 - "description": "Database", 503 - "aka": ["Postgres", "PG"], 504 - } 505 - ) 506 - + "\n" 507 - ) 466 + setup_entities_new_structure( 467 + Path(tmpdir), 468 + "test", 469 + [ 470 + # Entity with aka values 471 + { 472 + "type": "Person", 473 + "name": "Alice Johnson", 474 + "description": "Lead engineer", 475 + "aka": ["Ali", "AJ"], 476 + }, 477 + # Entity 
without aka 478 + { 479 + "type": "Company", 480 + "name": "TechCorp", 481 + "description": "Tech company", 482 + }, 483 + # Entity with multiple aka 484 + { 485 + "type": "Tool", 486 + "name": "PostgreSQL", 487 + "description": "Database", 488 + "aka": ["Postgres", "PG"], 489 + }, 490 + ], 491 + ) 508 492 509 493 monkeypatch.setenv("JOURNAL_PATH", tmpdir) 510 494 result = load_entity_names(spoken=False) 511 495 512 - # Should be semicolon-delimited with aka in parentheses 513 - assert result == "Alice Johnson (Ali, AJ); TechCorp; PostgreSQL (Postgres, PG)" 496 + # Check all entities are present with their aka 497 + assert "Alice Johnson (Ali, AJ)" in result 498 + assert "TechCorp" in result 499 + assert "PostgreSQL (Postgres, PG)" in result 514 500 515 501 516 502 def test_segment_key_hhmmss_with_duration():
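The updated tests above all call a `setup_entities_new_structure` helper whose definition is not shown in this diff. A minimal sketch of what such a fixture helper plausibly does, assuming the layout described in the commit (journal identity at `entities/<id>/entity.json`, facet relationship at `facets/<facet>/entities/<id>/entity.json`) and using a simplified stand-in for `entity_slug` (the real code uses python-slugify):

```python
import json
import re
from pathlib import Path


def setup_entities_new_structure(journal: Path, facet: str, entities) -> None:
    """Create journal identities plus facet relationships for test fixtures.

    Items are (type, name, description) tuples, or dicts carrying extra
    fields such as "aka" (stored on the journal-level identity record,
    which is where the spoken-mode loader reads them from).
    """

    def slug(name: str) -> str:
        # Simplified stand-in for entity_slug(); approximates slugify with
        # underscore separators ("John Smith" -> "john_smith", "AT&T" -> "at_t").
        return re.sub(r"[^a-z0-9]+", "_", name.lower()).strip("_")

    for item in entities:
        if isinstance(item, dict):
            etype, name, desc = item["type"], item["name"], item["description"]
            extra = {
                k: v for k, v in item.items()
                if k not in ("type", "name", "description")
            }
        else:
            etype, name, desc = item
            extra = {}

        entity_id = slug(name)

        # Journal-level identity: entities/<id>/entity.json
        journal_dir = journal / "entities" / entity_id
        journal_dir.mkdir(parents=True, exist_ok=True)
        identity = {
            "id": entity_id,
            "name": name,
            "type": etype,
            "created_at": 1700000000000,
        }
        identity.update(extra)
        (journal_dir / "entity.json").write_text(
            json.dumps(identity), encoding="utf-8"
        )

        # Facet relationship: facets/<facet>/entities/<id>/entity.json
        rel_dir = journal / "facets" / facet / "entities" / entity_id
        rel_dir.mkdir(parents=True, exist_ok=True)
        relationship = {"entity_id": entity_id, "description": desc}
        (rel_dir / "entity.json").write_text(
            json.dumps(relationship), encoding="utf-8"
        )
```

The two-file split matches the tests' expectations: identity fields (name, type, aka) land in the journal record, while the per-facet description lands in the relationship record.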
-2262
think/entities.py
··· 1 - # SPDX-License-Identifier: AGPL-3.0-only 2 - # Copyright (c) 2026 sol pbc 3 - 4 - """Entity management with journal-wide identity and facet-scoped relationships. 5 - 6 - Entity System Architecture: 7 - - Journal-level entities: entities/<id>/entity.json - canonical identity (name, type, aka) 8 - - Facet relationships: facets/<facet>/entities/<id>/entity.json - per-facet data 9 - - Detected entities: facets/<facet>/entities/<day>.jsonl - ephemeral daily discoveries 10 - - Entity memory: facets/<facet>/entities/<id>/ - voiceprints, observations (per-facet) 11 - 12 - The system supports both the new structure and legacy entities.jsonl files for 13 - backwards compatibility during migration. 14 - """ 15 - 16 - import hashlib 17 - import json 18 - import os 19 - import re 20 - import shutil 21 - import tempfile 22 - import time 23 - from pathlib import Path 24 - from typing import Any, Optional 25 - 26 - from slugify import slugify 27 - 28 - from think.utils import get_config, get_journal 29 - 30 - # Default timestamp for entities without activity data (Jan 1 2026 00:00:00 UTC) 31 - # Used as fallback in entity_last_active_ts() to ensure all entities have a sortable value 32 - DEFAULT_ACTIVITY_TS = 1767225600000 33 - 34 - # Standard entity types - used for UI suggestions and documentation. 35 - # Custom types are still allowed (validated by is_valid_entity_type regex). 36 - ENTITY_TYPES = [ 37 - {"name": "Person"}, 38 - {"name": "Company"}, 39 - {"name": "Project"}, 40 - {"name": "Tool"}, 41 - ] 42 - 43 - 44 - def get_identity_names() -> list[str]: 45 - """Get all names/aliases for the journal principal from identity config. 46 - 47 - Returns a list of names to match against entities, in display priority order: 48 - 1. identity.preferred (nickname/preferred name) - best for display 49 - 2. identity.name (full name) 50 - 3. identity.aliases (list of alternative names) 51 - 52 - The first element (if any) is the best name for display purposes. 
53 - Returns empty list if identity is not configured. 54 - """ 55 - config = get_config() 56 - identity = config.get("identity", {}) 57 - 58 - names: list[str] = [] 59 - 60 - # Preferred name first (best for display) 61 - preferred = identity.get("preferred", "").strip() 62 - if preferred: 63 - names.append(preferred) 64 - 65 - # Full name 66 - name = identity.get("name", "").strip() 67 - if name and name not in names: 68 - names.append(name) 69 - 70 - # Aliases 71 - aliases = identity.get("aliases", []) 72 - if isinstance(aliases, list): 73 - for alias in aliases: 74 - if isinstance(alias, str): 75 - alias = alias.strip() 76 - if alias and alias not in names: 77 - names.append(alias) 78 - 79 - return names 80 - 81 - 82 - def entity_last_active_ts(entity: dict[str, Any]) -> int: 83 - """Get entity's last activity timestamp with fallback chain. 84 - 85 - Returns a Unix timestamp (milliseconds) representing when the entity was 86 - last active, using the following priority: 87 - 1. last_seen (YYYYMMDD string, converted to local midnight) 88 - 2. updated_at (Unix ms) 89 - 3. attached_at (Unix ms) 90 - 4. DEFAULT_ACTIVITY_TS (Jan 1 2026) 91 - 92 - This ensures all entities have a sortable timestamp value. 
93 - 94 - Args: 95 - entity: Entity dictionary with optional last_seen, updated_at, attached_at fields 96 - 97 - Returns: 98 - Unix timestamp in milliseconds 99 - 100 - Examples: 101 - >>> entity_last_active_ts({"last_seen": "20260115"}) # Jan 15 2026 local midnight 102 - >>> entity_last_active_ts({"updated_at": 1700000000000}) 103 - 1700000000000 104 - >>> entity_last_active_ts({}) 105 - 1767225600000 # DEFAULT_ACTIVITY_TS (Jan 1 2026 UTC) 106 - """ 107 - from datetime import datetime 108 - 109 - # Priority 1: last_seen (YYYYMMDD string) 110 - last_seen = entity.get("last_seen") 111 - if last_seen and isinstance(last_seen, str) and len(last_seen) == 8: 112 - try: 113 - dt = datetime.strptime(last_seen, "%Y%m%d") 114 - return int(dt.timestamp() * 1000) 115 - except ValueError: 116 - pass # Malformed, fall through 117 - 118 - # Priority 2: updated_at 119 - updated_at = entity.get("updated_at") 120 - if updated_at and isinstance(updated_at, int) and updated_at > 0: 121 - return updated_at 122 - 123 - # Priority 3: attached_at 124 - attached_at = entity.get("attached_at") 125 - if attached_at and isinstance(attached_at, int) and attached_at > 0: 126 - return attached_at 127 - 128 - # Priority 4: Default 129 - return DEFAULT_ACTIVITY_TS 130 - 131 - 132 - def is_valid_entity_type(etype: str) -> bool: 133 - """Validate entity type: alphanumeric and spaces only, at least 3 characters.""" 134 - if not etype or len(etype.strip()) < 3: 135 - return False 136 - # Must contain only alphanumeric and spaces, and at least one alphanumeric character 137 - return bool( 138 - re.match(r"^[A-Za-z0-9 ]+$", etype) and re.search(r"[A-Za-z0-9]", etype) 139 - ) 140 - 141 - 142 - # Maximum length for entity slug before truncation 143 - MAX_ENTITY_SLUG_LENGTH = 200 144 - 145 - 146 - def entity_slug(name: str) -> str: 147 - """Generate a stable slug identifier for an entity name. 
148 - 149 - The slug is used as: 150 - - The `id` field stored in entity records 151 - - Folder names for entity memory storage 152 - - URL-safe programmatic references 153 - 154 - Uses python-slugify to convert names to lowercase with underscores. 155 - Long names are truncated with a hash suffix to ensure uniqueness. 156 - 157 - Args: 158 - name: Entity name (e.g., "Alice Johnson", "Acme Corp") 159 - 160 - Returns: 161 - Slug identifier (e.g., "alice_johnson", "acme_corp") 162 - 163 - Examples: 164 - >>> entity_slug("Alice Johnson") 165 - 'alice_johnson' 166 - >>> entity_slug("O'Brien") 167 - 'o_brien' 168 - >>> entity_slug("AT&T") 169 - 'at_t' 170 - >>> entity_slug("José García") 171 - 'jose_garcia' 172 - """ 173 - if not name or not name.strip(): 174 - return "" 175 - 176 - # Use slugify with underscore separator 177 - slug = slugify(name, separator="_") 178 - 179 - # Handle very long names - truncate and add hash suffix 180 - if len(slug) > MAX_ENTITY_SLUG_LENGTH: 181 - # Create hash of full name for uniqueness 182 - name_hash = hashlib.md5(name.encode()).hexdigest()[:8] 183 - # Truncate and append hash 184 - slug = slug[: MAX_ENTITY_SLUG_LENGTH - 9] + "_" + name_hash 185 - 186 - return slug 187 - 188 - 189 - def entity_memory_path(facet: str, name: str) -> Path: 190 - """Return path to entity's memory folder. 191 - 192 - Entity memory folders store persistent data about attached entities: 193 - observations (durable facts), voiceprints (voice recognition), etc. 
194 - 195 - Args: 196 - facet: Facet name (e.g., "personal", "work") 197 - name: Entity name (will be slugified) 198 - 199 - Returns: 200 - Path to facets/{facet}/entities/{entity_slug}/ 201 - 202 - Raises: 203 - ValueError: If name slugifies to empty string 204 - """ 205 - slug = entity_slug(name) 206 - if not slug: 207 - raise ValueError(f"Entity name '{name}' slugifies to empty string") 208 - 209 - return Path(get_journal()) / "facets" / facet / "entities" / slug 210 - 211 - 212 - def ensure_entity_memory(facet: str, name: str) -> Path: 213 - """Create entity memory folder if needed, return path. 214 - 215 - Args: 216 - facet: Facet name (e.g., "personal", "work") 217 - name: Entity name (will be slugified) 218 - 219 - Returns: 220 - Path to the created/existing folder 221 - 222 - Raises: 223 - ValueError: If name slugifies to empty string 224 - """ 225 - folder = entity_memory_path(facet, name) 226 - folder.mkdir(parents=True, exist_ok=True) 227 - return folder 228 - 229 - 230 - def rename_entity_memory(facet: str, old_name: str, new_name: str) -> bool: 231 - """Rename entity memory folder if it exists. 232 - 233 - Called when an entity is renamed to keep folder in sync. 
234 - 235 - Args: 236 - facet: Facet name 237 - old_name: Previous entity name 238 - new_name: New entity name 239 - 240 - Returns: 241 - True if folder was renamed, False if old folder didn't exist 242 - or names slugify to the same value 243 - 244 - Raises: 245 - ValueError: If either name slugifies to empty string 246 - OSError: If rename fails (e.g., target exists) 247 - """ 248 - old_folder = entity_memory_path(facet, old_name) 249 - new_folder = entity_memory_path(facet, new_name) 250 - 251 - # No rename needed if slugified names are the same 252 - if old_folder == new_folder: 253 - return False 254 - 255 - if not old_folder.exists(): 256 - return False 257 - 258 - if new_folder.exists(): 259 - raise OSError(f"Target folder already exists: {new_folder}") 260 - 261 - shutil.move(str(old_folder), str(new_folder)) 262 - return True 263 - 264 - 265 - # ----------------------------------------------------------------------------- 266 - # Journal-Level Entity Functions 267 - # ----------------------------------------------------------------------------- 268 - 269 - 270 - def journal_entity_path(entity_id: str) -> Path: 271 - """Return path to journal-level entity file. 272 - 273 - Args: 274 - entity_id: Entity ID (slug) 275 - 276 - Returns: 277 - Path to entities/<id>/entity.json 278 - """ 279 - return Path(get_journal()) / "entities" / entity_id / "entity.json" 280 - 281 - 282 - def load_journal_entity(entity_id: str) -> dict[str, Any] | None: 283 - """Load a journal-level entity by ID. 284 - 285 - Args: 286 - entity_id: Entity ID (slug) 287 - 288 - Returns: 289 - Entity dict with id, name, type, aka, is_principal, created_at fields, 290 - or None if not found. 
291 - """ 292 - path = journal_entity_path(entity_id) 293 - if not path.exists(): 294 - return None 295 - 296 - try: 297 - with open(path, "r", encoding="utf-8") as f: 298 - data = json.load(f) 299 - # Ensure id is present 300 - data["id"] = entity_id 301 - return data 302 - except (json.JSONDecodeError, OSError): 303 - return None 304 - 305 - 306 - def save_journal_entity(entity: dict[str, Any]) -> None: 307 - """Save a journal-level entity using atomic write. 308 - 309 - The entity must have an 'id' field. Creates the directory if needed. 310 - 311 - Args: 312 - entity: Entity dict with id, name, type, aka (optional), is_principal (optional), 313 - created_at fields. 314 - 315 - Raises: 316 - ValueError: If entity has no id field 317 - """ 318 - entity_id = entity.get("id") 319 - if not entity_id: 320 - raise ValueError("Entity must have an 'id' field") 321 - 322 - path = journal_entity_path(entity_id) 323 - path.parent.mkdir(parents=True, exist_ok=True) 324 - 325 - # Atomic write 326 - fd, temp_path = tempfile.mkstemp(dir=path.parent, prefix=".entity_", suffix=".tmp") 327 - try: 328 - with os.fdopen(fd, "w", encoding="utf-8") as f: 329 - json.dump(entity, f, ensure_ascii=False, indent=2) 330 - f.write("\n") 331 - os.replace(temp_path, path) 332 - except Exception: 333 - try: 334 - os.unlink(temp_path) 335 - except Exception: 336 - pass 337 - raise 338 - 339 - 340 - def scan_journal_entities() -> list[str]: 341 - """List all entity IDs from journal-level entities. 342 - 343 - Scans entities/ directory for subdirectories containing entity.json. 
344 - 345 - Returns: 346 - List of entity IDs (directory names) 347 - """ 348 - entities_dir = Path(get_journal()) / "entities" 349 - if not entities_dir.exists(): 350 - return [] 351 - 352 - entity_ids = [] 353 - for entry in entities_dir.iterdir(): 354 - if entry.is_dir() and (entry / "entity.json").exists(): 355 - entity_ids.append(entry.name) 356 - 357 - return sorted(entity_ids) 358 - 359 - 360 - def load_all_journal_entities() -> dict[str, dict[str, Any]]: 361 - """Load all journal-level entities. 362 - 363 - Returns: 364 - Dict mapping entity_id to entity dict 365 - """ 366 - entity_ids = scan_journal_entities() 367 - entities = {} 368 - for entity_id in entity_ids: 369 - entity = load_journal_entity(entity_id) 370 - if entity: 371 - entities[entity_id] = entity 372 - return entities 373 - 374 - 375 - def has_journal_principal() -> bool: 376 - """Check if any journal entity is already flagged as principal. 377 - 378 - Returns: 379 - True if a principal entity exists, False otherwise 380 - """ 381 - for entity_id in scan_journal_entities(): 382 - entity = load_journal_entity(entity_id) 383 - if entity and entity.get("is_principal"): 384 - return True 385 - return False 386 - 387 - 388 - def _should_be_principal(name: str, aka: list[str] | None) -> bool: 389 - """Check if an entity should be flagged as principal based on identity config. 
390 - 391 - Args: 392 - name: Entity name 393 - aka: Optional list of aliases 394 - 395 - Returns: 396 - True if the entity matches identity config, False otherwise 397 - """ 398 - identity_names = get_identity_names() 399 - if not identity_names: 400 - return False 401 - 402 - # Check if name or any aka matches identity 403 - names_to_check = [name.lower()] 404 - if aka: 405 - names_to_check.extend(a.lower() for a in aka) 406 - 407 - for identity_name in identity_names: 408 - if identity_name.lower() in names_to_check: 409 - return True 410 - 411 - return False 412 - 413 - 414 - def get_or_create_journal_entity( 415 - entity_id: str, 416 - name: str, 417 - entity_type: str, 418 - aka: list[str] | None = None, 419 - *, 420 - skip_principal: bool = False, 421 - ) -> dict[str, Any]: 422 - """Get existing journal entity or create new one. 423 - 424 - If entity exists, returns it unchanged (does not update fields). 425 - If entity doesn't exist, creates it with provided values. 426 - 427 - Args: 428 - entity_id: Entity ID (slug) 429 - name: Entity name 430 - entity_type: Entity type (e.g., "Person", "Company") 431 - aka: Optional list of aliases 432 - skip_principal: If True, don't flag as principal even if matches identity 433 - 434 - Returns: 435 - The existing or newly created entity dict 436 - """ 437 - existing = load_journal_entity(entity_id) 438 - if existing: 439 - return existing 440 - 441 - # Create new entity 442 - entity = { 443 - "id": entity_id, 444 - "name": name, 445 - "type": entity_type, 446 - "created_at": int(time.time() * 1000), 447 - } 448 - if aka: 449 - entity["aka"] = aka 450 - 451 - # Check if this should be the principal 452 - # Only flag if: matches identity, no existing principal, and not skipped 453 - if ( 454 - not skip_principal 455 - and _should_be_principal(name, aka) 456 - and not has_journal_principal() 457 - ): 458 - entity["is_principal"] = True 459 - 460 - save_journal_entity(entity) 461 - return entity 462 - 463 - 464 - # 
----------------------------------------------------------------------------- 465 - # Facet Relationship Functions 466 - # ----------------------------------------------------------------------------- 467 - 468 - 469 - def facet_relationship_path(facet: str, entity_id: str) -> Path: 470 - """Return path to facet relationship file. 471 - 472 - Args: 473 - facet: Facet name 474 - entity_id: Entity ID (slug) 475 - 476 - Returns: 477 - Path to facets/<facet>/entities/<id>/entity.json 478 - """ 479 - return ( 480 - Path(get_journal()) / "facets" / facet / "entities" / entity_id / "entity.json" 481 - ) 482 - 483 - 484 - def load_facet_relationship(facet: str, entity_id: str) -> dict[str, Any] | None: 485 - """Load a facet relationship for an entity. 486 - 487 - Args: 488 - facet: Facet name 489 - entity_id: Entity ID (slug) 490 - 491 - Returns: 492 - Relationship dict with entity_id, description, timestamps, etc., 493 - or None if not found. 494 - """ 495 - path = facet_relationship_path(facet, entity_id) 496 - if not path.exists(): 497 - return None 498 - 499 - try: 500 - with open(path, "r", encoding="utf-8") as f: 501 - data = json.load(f) 502 - # Ensure entity_id is present 503 - data["entity_id"] = entity_id 504 - return data 505 - except (json.JSONDecodeError, OSError): 506 - return None 507 - 508 - 509 - def save_facet_relationship( 510 - facet: str, entity_id: str, relationship: dict[str, Any] 511 - ) -> None: 512 - """Save a facet relationship using atomic write. 513 - 514 - Creates the directory if needed. 515 - 516 - Args: 517 - facet: Facet name 518 - entity_id: Entity ID (slug) 519 - relationship: Relationship dict with description, timestamps, etc. 
520 - """ 521 - path = facet_relationship_path(facet, entity_id) 522 - path.parent.mkdir(parents=True, exist_ok=True) 523 - 524 - # Ensure entity_id is in the relationship 525 - relationship["entity_id"] = entity_id 526 - 527 - # Atomic write 528 - fd, temp_path = tempfile.mkstemp( 529 - dir=path.parent, prefix=".relationship_", suffix=".tmp" 530 - ) 531 - try: 532 - with os.fdopen(fd, "w", encoding="utf-8") as f: 533 - json.dump(relationship, f, ensure_ascii=False, indent=2) 534 - f.write("\n") 535 - os.replace(temp_path, path) 536 - except Exception: 537 - try: 538 - os.unlink(temp_path) 539 - except Exception: 540 - pass 541 - raise 542 - 543 - 544 - def scan_facet_relationships(facet: str) -> list[str]: 545 - """List all entity IDs with relationships in a facet. 546 - 547 - Scans facets/<facet>/entities/ for subdirectories containing entity.json. 548 - 549 - Args: 550 - facet: Facet name 551 - 552 - Returns: 553 - List of entity IDs (directory names) 554 - """ 555 - entities_dir = Path(get_journal()) / "facets" / facet / "entities" 556 - if not entities_dir.exists(): 557 - return [] 558 - 559 - entity_ids = [] 560 - for entry in entities_dir.iterdir(): 561 - if entry.is_dir() and (entry / "entity.json").exists(): 562 - entity_ids.append(entry.name) 563 - 564 - return sorted(entity_ids) 565 - 566 - 567 - def _enrich_relationship_with_journal( 568 - relationship: dict[str, Any], 569 - journal_entity: dict[str, Any] | None, 570 - ) -> dict[str, Any]: 571 - """Merge journal entity fields into relationship for unified view. 572 - 573 - Creates a combined entity dict that looks like the legacy format, 574 - with identity fields (name, type, aka, is_principal) from journal 575 - and relationship fields (description, timestamps, etc.) from facet. 
576 - 577 - Args: 578 - relationship: Facet relationship dict 579 - journal_entity: Journal-level entity dict (or None) 580 - 581 - Returns: 582 - Merged entity dict with all fields 583 - """ 584 - # Start with relationship data 585 - result = dict(relationship) 586 - 587 - # Add identity fields from journal entity 588 - if journal_entity: 589 - result["id"] = journal_entity.get("id", relationship.get("entity_id", "")) 590 - result["name"] = journal_entity.get("name", "") 591 - result["type"] = journal_entity.get("type", "") 592 - if journal_entity.get("aka"): 593 - result["aka"] = journal_entity["aka"] 594 - if journal_entity.get("is_principal"): 595 - result["is_principal"] = True 596 - else: 597 - # No journal entity - use entity_id as id 598 - result["id"] = relationship.get("entity_id", "") 599 - 600 - # Remove entity_id from result (use id instead) 601 - result.pop("entity_id", None) 602 - 603 - return result 604 - 605 - 606 - def parse_entity_file( 607 - file_path: str, *, validate_types: bool = True 608 - ) -> list[dict[str, Any]]: 609 - """Parse entities from a JSONL file. 610 - 611 - This is the low-level file parsing function used by all entity loading code. 612 - Each line in the file should be a JSON object with type, name, and description fields. 613 - 614 - Generates `id` field (slug) for entities that don't have one, enabling 615 - lazy migration of existing entity files. 
616 - 617 - Args: 618 - file_path: Absolute path to entities.jsonl file 619 - validate_types: If True, filters out invalid entity types (default: True) 620 - 621 - Returns: 622 - List of entity dictionaries with id, type, name, and description keys 623 - 624 - Example: 625 - >>> parse_entity_file("/path/to/entities.jsonl") 626 - [{"id": "john_smith", "type": "Person", "name": "John Smith", "description": "Friend from college"}] 627 - """ 628 - if not os.path.isfile(file_path): 629 - return [] 630 - 631 - entities = [] 632 - with open(file_path, "r", encoding="utf-8") as f: 633 - for line in f: 634 - line = line.strip() 635 - if not line: 636 - continue 637 - try: 638 - data = json.loads(line) 639 - etype = data.get("type", "") 640 - name = data.get("name", "") 641 - desc = data.get("description", "") 642 - 643 - # Validate if requested 644 - if validate_types and not is_valid_entity_type(etype): 645 - continue 646 - 647 - # Generate id from name if not present (lazy migration) 648 - entity_id = data.get("id") or entity_slug(name) 649 - 650 - # Preserve all fields from JSON, ensuring core fields exist 651 - # Put id first for readability in JSONL output 652 - entity = { 653 - "id": entity_id, 654 - "type": etype, 655 - "name": name, 656 - "description": desc, 657 - } 658 - # Add any additional fields from the JSON 659 - for key, value in data.items(): 660 - if key not in entity: 661 - entity[key] = value 662 - 663 - entities.append(entity) 664 - except (json.JSONDecodeError, AttributeError): 665 - continue # Skip malformed lines 666 - 667 - return entities 668 - 669 - 670 - def entity_file_path(facet: str, day: Optional[str] = None) -> Path: 671 - """Return path to entity file for a facet. 
672 - 673 - Args: 674 - facet: Facet name (e.g., "personal", "work") 675 - day: Optional day in YYYYMMDD format for detected entities 676 - 677 - Returns: 678 - Path to entities.jsonl (attached) or entities/YYYYMMDD.jsonl (detected) 679 - """ 680 - facet_path = Path(get_journal()) / "facets" / facet 681 - 682 - if day is None: 683 - # Attached entities 684 - return facet_path / "entities.jsonl" 685 - else: 686 - # Detected entities for specific day 687 - return facet_path / "entities" / f"{day}.jsonl" 688 - 689 - 690 - def _load_entities_new_structure( 691 - facet: str, *, include_detached: bool = False 692 - ) -> list[dict[str, Any]] | None: 693 - """Load attached entities from new structure (facet relationships + journal entities). 694 - 695 - Returns None if no new-structure entities exist (fall back to legacy). 696 - Returns list of enriched entities if any new-structure relationships found. 697 - """ 698 - entity_ids = scan_facet_relationships(facet) 699 - if not entity_ids: 700 - return None # No new structure, fall back to legacy 701 - 702 - # Load all journal entities for enrichment 703 - journal_entities = load_all_journal_entities() 704 - 705 - entities = [] 706 - for entity_id in entity_ids: 707 - relationship = load_facet_relationship(facet, entity_id) 708 - if relationship is None: 709 - continue 710 - 711 - # Skip detached if not requested 712 - if not include_detached and relationship.get("detached"): 713 - continue 714 - 715 - # Enrich with journal entity data 716 - journal_entity = journal_entities.get(entity_id) 717 - enriched = _enrich_relationship_with_journal(relationship, journal_entity) 718 - entities.append(enriched) 719 - 720 - return entities 721 - 722 - 723 - def _load_entities_legacy( 724 - facet: str, *, include_detached: bool = False 725 - ) -> list[dict[str, Any]]: 726 - """Load attached entities from legacy entities.jsonl file. 727 - 728 - Includes deduplication logic for self-healing corrupted files. 
    """
    import logging

    path = entity_file_path(facet, day=None)
    entities = parse_entity_file(str(path))

    # Deduplicate by name (case-insensitive) to self-heal
    seen: dict[str, dict[str, Any]] = {}
    duplicates_found = []

    for entity in entities:
        name = entity.get("name", "")
        name_lower = name.lower()

        if name_lower in seen:
            # Duplicate found - keep the one with most recent activity
            existing = seen[name_lower]
            existing_time = entity_last_active_ts(existing)
            current_time = entity_last_active_ts(entity)

            if current_time > existing_time:
                duplicates_found.append(existing.get("name", ""))
                seen[name_lower] = entity
            else:
                duplicates_found.append(name)
        else:
            seen[name_lower] = entity

    if duplicates_found:
        logging.info(
            f"Healed {len(duplicates_found)} duplicate entities in facet "
            f"'{facet}': {duplicates_found}"
        )

    entities = list(seen.values())

    # Filter out detached if not requested
    if not include_detached:
        entities = [e for e in entities if not e.get("detached")]

    return entities


def load_entities(
    facet: str, day: Optional[str] = None, *, include_detached: bool = False
) -> list[dict[str, Any]]:
    """Load entities from facet.

    For attached entities (day=None), tries new structure first (facet relationships
    enriched with journal entities), then falls back to legacy entities.jsonl.

    For detected entities (day provided), loads from day-specific JSONL files.

    Args:
        facet: Facet name
        day: Optional day in YYYYMMDD format for detected entities
        include_detached: If True, includes entities with detached=True.
            Default False excludes detached entities.
            Only applies to attached entities (day=None).

    Returns:
        List of entity dictionaries with id, type, name, description, and other fields.

    Example:
        >>> load_entities("personal")
        [{"id": "john_smith", "type": "Person", "name": "John Smith", "description": "Friend"}]
    """
    # For detected entities, use day-specific files (unchanged)
    if day is not None:
        path = entity_file_path(facet, day)
        return parse_entity_file(str(path))

    # For attached entities, try new structure first
    entities = _load_entities_new_structure(facet, include_detached=include_detached)
    if entities is not None:
        return entities

    # Fall back to legacy structure
    return _load_entities_legacy(facet, include_detached=include_detached)


def _ensure_principal_flag(entities: list[dict[str, Any]]) -> None:
    """Ensure exactly one entity is flagged as principal if one matches identity.

    Checks if any entity already has is_principal=True. If not, attempts to
    find an entity matching the journal identity config (name, preferred, aliases)
    and flags it as principal.

    This is called during save_entities() for attached entities only.
    Modifies entities in place.

    Args:
        entities: List of attached entity dicts (modified in place)
    """
    # Check if any entity already has is_principal flag
    for entity in entities:
        if entity.get("is_principal"):
            return  # Already have a principal, nothing to do

    # No principal flagged - try to find one matching identity
    identity_names = get_identity_names()
    if not identity_names:
        return  # No identity configured

    # Build lookup for case-insensitive matching
    # Maps lowercase name/aka -> entity
    name_map: dict[str, dict[str, Any]] = {}
    for entity in entities:
        if entity.get("detached"):
            continue  # Skip detached entities

        name = entity.get("name", "")
        if name:
            name_map[name.lower()] = entity

        # Also check akas
        aka_list = entity.get("aka", [])
        if isinstance(aka_list, list):
            for aka in aka_list:
                if aka:
                    name_map[aka.lower()] = entity

    # Try to match identity names against entities
    for identity_name in identity_names:
        identity_lower = identity_name.lower()
        if identity_lower in name_map:
            # Found a match - flag as principal
            name_map[identity_lower]["is_principal"] = True
            return


def _save_entities_detected(
    facet: str, entities: list[dict[str, Any]], day: str
) -> None:
    """Save detected entities to day-specific JSONL file."""
    path = entity_file_path(facet, day)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Ensure id field is present
    for entity in entities:
        name = entity.get("name", "")
        expected_id = entity_slug(name)
        if entity.get("id") != expected_id:
            entity["id"] = expected_id

    # Sort by type, then name for consistency
    sorted_entities = sorted(
        entities, key=lambda e: (e.get("type", ""), e.get("name", ""))
    )

    # Format as JSONL and write atomically
    lines = [json.dumps(e, ensure_ascii=False) + "\n" for e in sorted_entities]

    fd, temp_path = tempfile.mkstemp(
        dir=path.parent, prefix=".entities_", suffix=".tmp"
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.writelines(lines)
        os.replace(temp_path, path)
    except Exception:
        try:
            os.unlink(temp_path)
        except Exception:
            pass
        raise


def _save_entities_attached(facet: str, entities: list[dict[str, Any]]) -> None:
    """Save attached entities to new structure (journal entities + facet relationships)."""
    # Validate uniqueness
    seen_names: set[str] = set()
    seen_ids: set[str] = set()

    for entity in entities:
        name = entity.get("name", "")
        expected_id = entity_slug(name)

        # Set or update id
        if entity.get("id") != expected_id:
            entity["id"] = expected_id

        name_lower = name.lower()
        if name_lower in seen_names:
            raise ValueError(f"Duplicate entity name '{name}' in facet '{facet}'")
        seen_names.add(name_lower)

        if expected_id in seen_ids:
            raise ValueError(
                f"Duplicate entity id '{expected_id}' in facet '{facet}' "
                f"(names may slugify to same value)"
            )
        seen_ids.add(expected_id)

    # Fields that belong to journal entity (identity)
    journal_fields = {"id", "name", "type", "aka", "is_principal", "created_at"}

    # Process each entity
    for entity in entities:
        entity_id = entity["id"]
        name = entity.get("name", "")
        entity_type = entity.get("type", "")
        aka = entity.get("aka")
        is_detached = entity.get("detached", False)

        # Ensure journal entity exists (creates if needed, preserves if exists)
        # Skip principal flagging for detached entities
        journal_entity = get_or_create_journal_entity(
            entity_id=entity_id,
            name=name,
            entity_type=entity_type,
            aka=aka if isinstance(aka, list) else None,
            skip_principal=is_detached,
        )

        # Update journal entity if name/type/aka changed
        journal_updated = False
        if journal_entity.get("name") != name:
            journal_entity["name"] = name
            journal_updated = True
        if journal_entity.get("type") != entity_type:
            journal_entity["type"] = entity_type
            journal_updated = True
        if aka and isinstance(aka, list):
            # Merge aka lists (union)
            existing_aka = set(journal_entity.get("aka", []))
            new_aka = existing_aka | set(aka)
            if new_aka != existing_aka:
                journal_entity["aka"] = sorted(new_aka)
                journal_updated = True
        # Only propagate is_principal if explicitly set and entity not detached
        if (
            entity.get("is_principal")
            and not is_detached
            and not journal_entity.get("is_principal")
        ):
            journal_entity["is_principal"] = True
            journal_updated = True

        if journal_updated:
            save_journal_entity(journal_entity)

        # Build relationship record (all non-identity fields)
        relationship = {
            "entity_id": entity_id,
        }
        for key, value in entity.items():
            if key not in journal_fields:
                relationship[key] = value

        # Save facet relationship
        save_facet_relationship(facet, entity_id, relationship)


def save_entities(
    facet: str, entities: list[dict[str, Any]], day: Optional[str] = None
) -> None:
    """Save entities to new structure.

    For detected entities (day provided), writes to day-specific JSONL files.
    For attached entities (day=None), writes to:
    - Journal-level entity files: entities/<id>/entity.json (identity)
    - Facet relationship files: facets/<facet>/entities/<id>/entity.json

    Ensures all entities have an `id` field (generates from name if missing).
    For attached entities, validates name uniqueness within the facet and
    ensures the principal entity is flagged at the journal level.

    Args:
        facet: Facet name
        entities: List of entity dictionaries (must have type, name, description keys;
            attached entities may also have id, attached_at, updated_at timestamps)
        day: Optional day in YYYYMMDD format for detected entities

    Raises:
        ValueError: If duplicate names found in attached entities (day=None)
    """
    if day is not None:
        _save_entities_detected(facet, entities, day)
    else:
        _save_entities_attached(facet, entities)


def update_entity_description(
    facet: str,
    name: str,
    old_description: str,
    new_description: str,
    day: Optional[str] = None,
) -> dict[str, Any]:
    """Update an entity's description after validating current state.

    Sets updated_at timestamp to current time on successful update.

    Args:
        facet: Facet name
        name: Entity name to match (unique within facet)
        old_description: Current description (guard - must match)
        new_description: New description to set
        day: Optional day for detected entities

    Returns:
        The updated entity dict

    Raises:
        ValueError: If entity not found or guard mismatch
    """
    # Load ALL entities including detached to avoid data loss on save
    # For attached entities (day=None), we need include_detached=True
    entities = (
        load_entities(facet, day, include_detached=True)
        if day is None
        else load_entities(facet, day)
    )

    for entity in entities:
        # Skip detached entities when searching
        if entity.get("detached"):
            continue
        if entity.get("name") == name:
            current_desc = entity.get("description", "")
            if current_desc != old_description:
                raise ValueError(
                    f"Description mismatch for '{name}': expected '{old_description}', "
                    f"found '{current_desc}'"
                )
            entity["description"] = new_description
            entity["updated_at"] = int(time.time() * 1000)
            save_entities(facet, entities, day)
            return entity

    raise ValueError(f"Entity '{name}' not found in facet '{facet}'")


def load_all_attached_entities(
    *,
    sort_by: str | None = None,
    limit: int | None = None,
) -> list[dict[str, Any]]:
    """Load all attached entities from all facets with deduplication.

    Iterates facets in sorted (alphabetical) order. When the same entity
    ID appears in multiple facets, keeps the first occurrence.

    Uses load_entities() for each facet, which handles both new structure
    (journal entities + facet relationships) and legacy entities.jsonl.

    Args:
        sort_by: Optional field to sort by. Currently supports "last_seen",
            which sorts by recency (entities without the field go to end).
        limit: Optional maximum number of entities to return (applied after
            deduplication and sorting).

    Returns:
        List of entity dictionaries, deduplicated by id

    Example:
        >>> load_all_attached_entities()
        [{"id": "john_smith", "type": "Person", "name": "John Smith", ...}, ...]

        >>> load_all_attached_entities(sort_by="last_seen", limit=20)
        # Returns 20 most recently seen entities

    Note:
        Used for agent context loading. Provides deterministic behavior
        despite allowing independent entity descriptions across facets.
    """
    facets_dir = Path(get_journal()) / "facets"
    if not facets_dir.exists():
        return []

    # Track seen IDs for deduplication (use ID instead of name for uniqueness)
    seen_ids: set[str] = set()
    all_entities: list[dict[str, Any]] = []

    # Process facets in sorted order for deterministic results
    for facet_path in sorted(facets_dir.iterdir()):
        if not facet_path.is_dir():
            continue

        facet_name = facet_path.name

        # Use load_entities which handles both new and legacy structures
        for entity in load_entities(facet_name, include_detached=False):
            entity_id = entity.get("id", "")
            # Keep first occurrence only (deduplicate by ID)
            if entity_id and entity_id not in seen_ids:
                seen_ids.add(entity_id)
                all_entities.append(entity)

    # Sort if requested
    if sort_by == "last_seen":
        # Sort by activity timestamp descending (uses full fallback chain)
        all_entities.sort(
            key=entity_last_active_ts,
            reverse=True,
        )

    # Apply limit if requested
    if limit is not None and limit > 0:
        all_entities = all_entities[:limit]

    return all_entities

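The first-occurrence-wins merge in `load_all_attached_entities` can be sketched standalone (the data and helper name below are illustrative, not part of this module's API):

```python
# Facets are visited in sorted order; the first facet to mention an
# entity id wins, which keeps descriptions deterministic across facets.
def dedupe_by_id(facet_entities: dict[str, list[dict]]) -> list[dict]:
    seen: set[str] = set()
    merged: list[dict] = []
    for facet in sorted(facet_entities):  # deterministic facet order
        for entity in facet_entities[facet]:
            eid = entity.get("id", "")
            if eid and eid not in seen:
                seen.add(eid)
                merged.append(entity)
    return merged

facets = {
    "work": [{"id": "alice", "description": "colleague"}],
    "personal": [{"id": "alice", "description": "friend"}, {"id": "bob"}],
}
result = dedupe_by_id(facets)
# "personal" sorts before "work", so alice keeps the "friend" description
```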

def _extract_spoken_names(entities: list[dict[str, Any]]) -> list[str]:
    """Extract spoken-form names from entity list.

    Extracts shortened forms optimized for audio transcription:
    - First word from base name (without parentheses)
    - All items from within parentheses (comma-separated)

    Examples:
    - "Ryan Reed (R2)" → ["Ryan", "R2"]
    - "Federal Aviation Administration (FAA)" → ["Federal", "FAA"]
    - "Acme Corp" → ["Acme"]

    Args:
        entities: List of entity dictionaries with "name" and optional "aka" fields

    Returns:
        List of unique spoken names, preserving insertion order
    """
    spoken_names: list[str] = []

    def add_name_variants(name: str) -> None:
        """Extract and add first word + parenthetical items from a name."""
        if not name:
            return

        # Get base name (without parens) and extract first word
        base_name = re.sub(r"\s*\([^)]+\)", "", name).strip()
        first_word = base_name.split()[0] if base_name else None

        # Add first word
        if first_word and first_word not in spoken_names:
            spoken_names.append(first_word)

        # Extract and add all items from parens (comma-separated)
        paren_match = re.search(r"\(([^)]+)\)", name)
        if paren_match:
            paren_items = [item.strip() for item in paren_match.group(1).split(",")]
            for item in paren_items:
                if item and item not in spoken_names:
                    spoken_names.append(item)

    for entity in entities:
        name = entity.get("name", "")
        if name:
            add_name_variants(name)

        # Process aka list with same logic
        aka_list = entity.get("aka", [])
        if isinstance(aka_list, list):
            for aka_name in aka_list:
                add_name_variants(aka_name)

    return spoken_names


def load_entity_names(
    *,
    facet: str | None = None,
    spoken: bool = False,
) -> str | list[str] | None:
    """Load entity names from entities.jsonl for AI transcription context.

    This function extracts just the entity names (no types or descriptions) from
    entity files. When spoken=False (default), returns them as a
    semicolon-delimited string. When spoken=True, returns a list of shortened forms
    optimized for audio transcription.

    When facet is None, loads and merges entities from ALL facets with
    deduplication (first occurrence wins when same name appears in multiple facets).
    Falls back to top-level entities.jsonl if no facets exist.

    When spoken=True, uses uniform processing for all entity types:
    - Extracts first word from base name (without parentheses)
    - Extracts all items from within parentheses (comma-separated)
    - Examples:
        - "Ryan Reed (R2)" → ["Ryan", "R2"]
        - "Federal Aviation Administration (FAA)" → ["Federal", "FAA"]
        - "Acme Corp" → ["Acme"]
        - "pytest" → ["pytest"]

    Args:
        facet: Optional facet name. If provided, loads from facets/{facet}/entities.jsonl
            If None, loads from ALL facets using load_all_attached_entities().
        spoken: If True, returns list of shortened forms for speech recognition.
            If False, returns semicolon-delimited string of full names.

    Returns:
        When spoken=False: Semicolon-delimited string of entity names with aka values
            in parentheses (e.g., "John Smith (Johnny); Acme Corp (ACME, AcmeCo)"),
            or None if no entities found.
        When spoken=True: List of shortened entity names for speech, or None if no
            entities found.
    """
    # Load entities using existing utilities
    if facet is None:
        # Load from ALL facets with deduplication
        entities = load_all_attached_entities()
    else:
        # Load from specific facet
        entities = load_entities(facet)

    if not entities:
        return None

    # Transform entity dicts into desired format
    if not spoken:
        # Non-spoken mode: semicolon-delimited string of full names with aka in parentheses
        entity_names = []
        for entity in entities:
            name = entity.get("name", "")
            if name and name not in entity_names:
                # Check for aka values and append in parentheses
                aka_list = entity.get("aka", [])
                if isinstance(aka_list, list) and aka_list:
                    # Format: "Name (aka1, aka2, aka3)"
                    aka_str = ", ".join(aka_list)
                    formatted_name = f"{name} ({aka_str})"
                else:
                    formatted_name = name
                entity_names.append(formatted_name)
        return "; ".join(entity_names) if entity_names else None
    else:
        # Spoken mode: list of shortened forms
        spoken_names = _extract_spoken_names(entities)
        return spoken_names if spoken_names else None


def load_recent_entity_names(*, limit: int = 20) -> list[str] | None:
    """Load recently active entity names for transcription context.

    Returns spoken-form names from the most recently seen entities across all
    facets. Caller is responsible for formatting the list as needed.

    Args:
        limit: Maximum number of entities to include (default 20)

    Returns:
        List of spoken-form entity names, or None if no entities found.

    Example:
        >>> load_recent_entity_names(limit=5)
        ["Alice", "Bob", "R2", "Acme", "FAA"]
    """
    # Get most recently seen entities
    entities = load_all_attached_entities(sort_by="last_seen", limit=limit)
    if not entities:
        return None

    # Extract spoken names
    spoken_names = _extract_spoken_names(entities)
    if not spoken_names:
        return None

    return spoken_names


def validate_aka_uniqueness(
    aka: str,
    entities: list[dict[str, Any]],
    exclude_entity_name: str | None = None,
    fuzzy_threshold: int = 90,
) -> str | None:
    """Check if an aka collides with another entity's name or aka.

    Uses the same fuzzy matching logic as find_matching_attached_entity to
    catch collisions that would cause ambiguous lookups.

    Args:
        aka: The alias to validate
        entities: List of entity dicts to check against
        exclude_entity_name: Entity name to exclude from checks (the entity
            being updated). Case-sensitive exact match.
        fuzzy_threshold: Minimum score for fuzzy matching (default: 90)

    Returns:
        Name of conflicting entity if collision found, None if ok

    Example:
        >>> entities = [{"name": "CTT", ...}, {"name": "Other", ...}]
        >>> validate_aka_uniqueness("CTT", entities, exclude_entity_name="Other")
        "CTT"  # Conflicts with entity named "CTT"
        >>> validate_aka_uniqueness("ctt", entities, exclude_entity_name="CTT")
        None  # Ok, adding to CTT's own akas
    """
    # Filter out the entity being updated
    check_entities = [
        e
        for e in entities
        if e.get("name") != exclude_entity_name and not e.get("detached")
    ]

    if not check_entities:
        return None

    # Use the existing matching function to detect collisions
    match = find_matching_attached_entity(aka, check_entities, fuzzy_threshold)
    if match:
        return match.get("name")

    return None


def find_matching_attached_entity(
    detected_name: str,
    attached_entities: list[dict[str, Any]],
    fuzzy_threshold: int = 90,
) -> dict[str, Any] | None:
    """Find an attached entity matching a detected name.

    Uses tiered matching strategy (in order of precedence):
    1. Exact name, id, or aka match
    2. Case-insensitive name, id, or aka match
    3. Slugified query match against id
    4. First-word match (unambiguous only, min 3 chars)
    5. Fuzzy match using rapidfuzz (score >= threshold)

    Args:
        detected_name: Name, id (slug), or aka to search for
        attached_entities: List of attached entity dicts to search
        fuzzy_threshold: Minimum score (0-100) for fuzzy matching (default: 90)

    Returns:
        Matched entity dict, or None if no match found

    Example:
        >>> attached = [{"id": "robert_johnson", "name": "Robert Johnson", "aka": ["Bob", "Bobby"]}]
        >>> find_matching_attached_entity("Bob", attached)
        {"id": "robert_johnson", "name": "Robert Johnson", "aka": ["Bob", "Bobby"]}
        >>> find_matching_attached_entity("robert_johnson", attached)
        {"id": "robert_johnson", "name": "Robert Johnson", "aka": ["Bob", "Bobby"]}
    """
    if not detected_name or not attached_entities:
        return None

    detected_lower = detected_name.lower()
    detected_slug = entity_slug(detected_name)

    # Build lookup structures for efficient matching
    # Maps exact name/id/aka -> entity
    exact_map: dict[str, dict[str, Any]] = {}
    # Maps id -> entity for slug matching
    id_map: dict[str, dict[str, Any]] = {}
    # Maps lowercase first word -> list of entities (for ambiguity detection)
    first_word_map: dict[str, list[dict[str, Any]]] = {}
    # All candidate strings for fuzzy matching -> entity
    fuzzy_candidates: dict[str, dict[str, Any]] = {}

    for entity in attached_entities:
        name = entity.get("name", "")
        entity_id = entity.get("id", "")
        if not name:
            continue

        name_lower = name.lower()

        # Tier 1 & 2: Exact and case-insensitive for name
        exact_map[name] = entity
        exact_map[name_lower] = entity

        # Also add id to exact map (compute from name if not present)
        if entity_id:
            exact_map[entity_id] = entity
            id_map[entity_id] = entity
        else:
            # Compute slug from name for entities without id
            name_slug = entity_slug(name)
            if name_slug:
                id_map[name_slug] = entity

        # Also add akas
        aka_list = entity.get("aka", [])
        if isinstance(aka_list, list):
            for aka in aka_list:
                if aka:
                    exact_map[aka] = entity
                    exact_map[aka.lower()] = entity

        # Tier 4: First word
        first_word = name.split()[0].lower() if name else ""
        if first_word and len(first_word) >= 3:
            if first_word not in first_word_map:
                first_word_map[first_word] = []
            first_word_map[first_word].append(entity)

        # Tier 5: Fuzzy candidates (name and akas)
        fuzzy_candidates[name] = entity
        if isinstance(aka_list, list):
            for aka in aka_list:
                if aka:
                    fuzzy_candidates[aka] = entity

    # Tier 1: Exact match (name, id, or aka)
    if detected_name in exact_map:
        return exact_map[detected_name]

    # Tier 2: Case-insensitive match
    if detected_lower in exact_map:
        return exact_map[detected_lower]

    # Tier 3: Slugified query match against id
    if detected_slug and detected_slug in id_map:
        return id_map[detected_slug]

    # Tier 4: First-word match (only if unambiguous)
    if len(detected_name) >= 3:
        matches = first_word_map.get(detected_lower, [])
        if len(matches) == 1:
            return matches[0]

    # Tier 5: Fuzzy match
    if len(detected_name) >= 4 and fuzzy_candidates:
        try:
            from rapidfuzz import fuzz, process

            result = process.extractOne(
                detected_name,
                fuzzy_candidates.keys(),
                scorer=fuzz.token_sort_ratio,
                score_cutoff=fuzzy_threshold,
            )
            if result:
                matched_str, _score, _index = result
                return fuzzy_candidates[matched_str]
        except ImportError:
            # rapidfuzz not available, skip fuzzy matching
            pass

    return None


def resolve_entity(
    facet: str,
    query: str,
    fuzzy_threshold: int = 90,
    include_detached: bool = False,
) -> tuple[dict[str, Any] | None, list[dict[str, Any]] | None]:
    """Resolve an entity query to a single attached entity.

    This is the primary entry point for MCP tools to look up entities.
    Accepts any form of entity reference (name, id/slug, aka) and resolves
    to a single unambiguous entity.

    Uses tiered matching strategy:
    1. Exact name, id, or aka match
    2. Case-insensitive match
    3. Slugified query match against id
    4. First-word match (only if unambiguous)
    5. Fuzzy match (if single result above threshold)

    Args:
        facet: Facet name (e.g., "personal", "work")
        query: Name, id (slug), or aka to search for
        fuzzy_threshold: Minimum score (0-100) for fuzzy matching (default: 90)
        include_detached: If True, also search detached entities (default: False)

    Returns:
        Tuple of (entity, candidates):
        - If found: (entity_dict, None)
        - If not found: (None, list of closest candidates)
        - If ambiguous: (None, list of matching candidates)

    Examples:
        >>> entity, _ = resolve_entity("work", "Alice Johnson")
        >>> entity, _ = resolve_entity("work", "alice_johnson")  # by id
        >>> entity, _ = resolve_entity("work", "Ali")  # by aka
        >>> _, candidates = resolve_entity("work", "unknown")  # not found
    """
    if not query or not query.strip():
        return None, []

    # Load attached entities
    entities = load_entities(facet, day=None, include_detached=include_detached)
    if not entities:
        return None, []

    # Try to find a match
    match = find_matching_attached_entity(query, entities, fuzzy_threshold)
    if match:
        return match, None

    # No match found - find closest candidates for error message
    # Get top fuzzy matches as suggestions
    candidates: list[dict[str, Any]] = []

    try:
        from rapidfuzz import fuzz, process

        # Build candidate strings
        fuzzy_candidates: dict[str, dict[str, Any]] = {}
        for entity in entities:
            name = entity.get("name", "")
            if name:
                fuzzy_candidates[name] = entity
                aka_list = entity.get("aka", [])
                if isinstance(aka_list, list):
                    for aka in aka_list:
                        if aka:
                            fuzzy_candidates[aka] = entity

        # Get top 3 matches regardless of threshold
        results = process.extract(
            query,
            fuzzy_candidates.keys(),
            scorer=fuzz.token_sort_ratio,
            limit=3,
        )
        seen_names: set[str] = set()
        for matched_str, _score, _index in results:
            entity = fuzzy_candidates[matched_str]
            name = entity.get("name", "")
            if name and name not in seen_names:
                seen_names.add(name)
                candidates.append(entity)
    except ImportError:
        # rapidfuzz not available, return first few entities as candidates
        candidates = entities[:3]

    return None, candidates


def touch_entity(facet: str, name: str, day: str) -> str:
    """Update last_seen timestamp on an attached entity.

    Sets the last_seen field to the provided day if the entity exists
    and either has no last_seen or the new day is more recent.

    Args:
        facet: Facet name
        name: Exact name of the attached entity to touch
        day: Day string in YYYYMMDD format

    Returns:
        "updated" if entity was found and last_seen was updated,
        "skipped" if entity was found but day is not more recent,
        "not_found" if entity was not found

    Example:
        >>> touch_entity("work", "Alice Johnson", "20250115")
        "updated"
    """
    # Load ALL attached entities including detached to avoid data loss on save
    entities = load_entities(facet, day=None, include_detached=True)

    for entity in entities:
        # Skip detached entities
        if entity.get("detached"):
            continue
        if entity.get("name") == name:
            current_last_seen = entity.get("last_seen", "")
            # Only update if new day is more recent (or no existing last_seen)
            if not current_last_seen or day > current_last_seen:
                entity["last_seen"] = day
                save_entities(facet, entities, day=None)
                return "updated"
            # Entity found but day is not more recent
            return "skipped"

    return "not_found"


def parse_knowledge_graph_entities(day: str) -> list[str]:
    """Parse entity names from a day's knowledge graph.

    Extracts entity names from markdown tables in the knowledge graph insight.
    Entity names appear in bold (**Name**) in the first column of tables.

    Args:
        day: Day string in YYYYMMDD format

    Returns:
        List of unique entity names found in the knowledge graph.
        Returns empty list if KG doesn't exist or can't be parsed.

    Example:
        >>> parse_knowledge_graph_entities("20260108")
        ["Jeremie Miller (Jer)", "Neal Satterfield", "Flightline", ...]
    """
    journal = get_journal()
    kg_path = Path(journal) / day / "insights" / "knowledge_graph.md"

    if not kg_path.exists():
        return []

    try:
        content = kg_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return []

    # Extract bold names from first column of markdown tables
    # Pattern matches: | **Name** | ... (first column of table rows)
    # Also matches relationship mapping tables: | **Name** | **Target** | ...
    entity_names: set[str] = set()

    # Match table rows with bold text in first or second column
    # Format: | **Entity Name** | Type | ... or | **Source** | **Target** | ...
    table_row_pattern = re.compile(r"^\|\s*\*\*(.+?)\*\*\s*\|", re.MULTILINE)

    for match in table_row_pattern.finditer(content):
        name = match.group(1).strip()
        if name:
            entity_names.add(name)

    # Also extract targets from relationship mapping (second column)
    # Format: | **Source** | **Target** | Relationship | ...
    relationship_pattern = re.compile(
        r"^\|\s*\*\*.+?\*\*\s*\|\s*\*\*(.+?)\*\*\s*\|", re.MULTILINE
    )

    for match in relationship_pattern.finditer(content):
        name = match.group(1).strip()
        if name:
            entity_names.add(name)

    return list(entity_names)


def touch_entities_from_activity(
    facet: str, names: list[str], day: str
) -> dict[str, Any]:
    """Update last_seen for attached entities matching activity names.

    For each name in the activity list, attempts to find a matching
    attached entity using fuzzy matching and updates its last_seen field.

    Args:
        facet: Facet name
        names: List of entity names from activity (e.g., knowledge graph)
        day: Day string in YYYYMMDD format

    Returns:
        Summary dict with:
        - matched: List of (activity_name, attached_name) tuples for matches found
        - updated: List of attached entity names that were updated
        - skipped: List of attached entity names already up-to-date

    Example:
        >>> touch_entities_from_activity("work", ["Bob", "FAA"], "20260108")
        {"matched": [("Bob", "Robert Johnson"), ("FAA", "Federal Aviation Administration")],
         "updated": ["Robert Johnson", "Federal Aviation Administration"],
         "skipped": []}
    """
    if not names:
        return {"matched": [], "updated": [], "skipped": []}

    # Load attached entities (excluding detached)
    attached = load_entities(facet, day=None, include_detached=False)
    if not attached:
        return {"matched": [], "updated": [], "skipped": []}

    # Track matches and which entities need updating
    matched: list[tuple[str, str]] = []
    needs_update: dict[str, str] = {}  # attached_name -> most_recent_day

    for activity_name in names:
        entity = find_matching_attached_entity(activity_name, attached)
        if entity:
            attached_name = entity.get("name", "")
            if attached_name:
                matched.append((activity_name, attached_name))
                # Track the day for this entity (may be touched multiple times)
                current = needs_update.get(attached_name, "")
                if not current or day > current:
                    needs_update[attached_name] = day

    # Now batch the updates
    updated: list[str] = []
    skipped: list[str] = []

    for attached_name, update_day in needs_update.items():
        result = touch_entity(facet, attached_name, update_day)
        if result == "updated":
            updated.append(attached_name)
        else:
            # "skipped" (already up-to-date) or "not_found"
            skipped.append(attached_name)

    return {"matched": matched, "updated": updated, "skipped": skipped}


def load_detected_entities_recent(facet: str, days: int = 30) -> list[dict[str, Any]]:
    """Load detected entities from last N days, excluding those matching attached entities.

    Scans detected entity files in reverse chronological order (newest first),
    aggregating by (type, name) to provide count and last_seen tracking.

    Uses fuzzy matching to exclude detected entities that match attached entities
    by name, aka, normalized form, first word, or fuzzy similarity.

    Args:
        facet: Facet name
        days: Number of days to look back (default: 30)

    Returns:
        List of detected entity dictionaries with aggregation data:
        - type: Entity type
        - name: Entity name
        - description: Description from most recent detection
        - count: Number of days entity was detected
        - last_seen: Most recent day (YYYYMMDD) entity was detected

        Entities are excluded if they match an attached entity via fuzzy matching.

    Example:
        >>> load_detected_entities_recent("personal", days=30)
        [{"type": "Person", "name": "Charlie", "description": "Met at coffee shop",
          "count": 3, "last_seen": "20250115"}]
    """
    from datetime import datetime, timedelta

    journal = get_journal()

    # Load attached entities (excluding detached) for fuzzy matching
    # Detached entities should appear in detected list again
    attached = load_entities(facet, include_detached=False)

    # Cache for already-checked names to avoid repeated fuzzy matching
    # Maps detected name -> True (excluded) or False (not excluded)
    exclusion_cache: dict[str, bool] = {}

    def is_excluded(name: str) -> bool:
        """Check if a detected name matches any attached entity."""
        if name in exclusion_cache:
            return exclusion_cache[name]
        match = find_matching_attached_entity(name, attached)
        excluded = match is not None
        exclusion_cache[name] = excluded
        return excluded

    # Calculate date range cutoff
    cutoff_date = datetime.now() - timedelta(days=days)
    cutoff_str = cutoff_date.strftime("%Y%m%d")

    # Get entities directory and find all day files
    entities_dir = Path(journal) / "facets" / facet / "entities"
    if not entities_dir.exists():
        return []

    # Glob day files and sort descending (newest first)
    day_files = sorted(entities_dir.glob("*.jsonl"), reverse=True)

    # Aggregate entities by (type, name)
    # Key: (type, name) -> {entity data with count, last_seen}
    detected_map: dict[tuple[str, str], dict[str, Any]] = {}

    for day_file in day_files:
        day = day_file.stem  # YYYYMMDD

        # Skip files outside date range
        if day < cutoff_str:
            continue

        # Parse entities from this day
        day_entities = parse_entity_file(str(day_file))
for entity in day_entities: 1788 - etype = entity.get("type", "") 1789 - name = entity.get("name", "") 1790 - 1791 - # Skip if matches attached entity (using fuzzy matching) 1792 - if is_excluded(name): 1793 - continue 1794 - 1795 - key = (etype, name) 1796 - 1797 - if key not in detected_map: 1798 - # First occurrence (most recent day) - store full entity 1799 - detected_map[key] = { 1800 - "type": etype, 1801 - "name": name, 1802 - "description": entity.get("description", ""), 1803 - "count": 1, 1804 - "last_seen": day, 1805 - } 1806 - else: 1807 - # Subsequent occurrence - just increment count 1808 - detected_map[key]["count"] += 1 1809 - 1810 - return list(detected_map.values()) 1811 - 1812 - 1813 - def format_entities( 1814 - entries: list[dict], 1815 - context: dict | None = None, 1816 - ) -> tuple[list[dict], dict]: 1817 - """Format entity JSONL entries to markdown chunks. 1818 - 1819 - This is the formatter function used by the formatters registry. 1820 - Works for both attached entities (facets/*/entities.jsonl) and 1821 - detected entities (facets/*/entities/*.jsonl). 
1822 - 1823 - Args: 1824 - entries: Raw JSONL entries (one entity per line) 1825 - context: Optional context with: 1826 - - file_path: Path to JSONL file (for extracting facet name and type) 1827 - 1828 - Returns: 1829 - Tuple of (chunks, meta) where: 1830 - - chunks: List of dicts with keys: 1831 - - timestamp: int (unix ms) 1832 - - markdown: str 1833 - - source: dict (original entity entry) 1834 - - meta: Dict with optional "header" and "error" keys 1835 - """ 1836 - from datetime import datetime 1837 - 1838 - ctx = context or {} 1839 - file_path = ctx.get("file_path") 1840 - meta: dict[str, Any] = {} 1841 - chunks: list[dict[str, Any]] = [] 1842 - 1843 - # Determine if attached or detected, extract facet name and day 1844 - facet_name = "unknown" 1845 - is_detected = False 1846 - day_str: str | None = None 1847 - file_mtime_ms = 0 1848 - 1849 - if file_path: 1850 - file_path = Path(file_path) 1851 - 1852 - # Get file modification time as fallback timestamp (in milliseconds) 1853 - try: 1854 - file_mtime_ms = int(file_path.stat().st_mtime * 1000) 1855 - except (OSError, ValueError): 1856 - pass 1857 - 1858 - # Extract facet name from path 1859 - # Pattern: facets/{facet}/entities.jsonl or facets/{facet}/entities/{day}.jsonl 1860 - path_str = str(file_path) 1861 - facet_match = re.search(r"facets/([^/]+)/entities", path_str) 1862 - if facet_match: 1863 - facet_name = facet_match.group(1) 1864 - 1865 - # Check if detected (has day in filename) 1866 - if file_path.parent.name == "entities" and file_path.stem.isdigit(): 1867 - is_detected = True 1868 - day_str = file_path.stem 1869 - 1870 - # Build header 1871 - if is_detected and day_str: 1872 - # Format day as YYYY-MM-DD for readability 1873 - formatted_day = f"{day_str[:4]}-{day_str[4:6]}-{day_str[6:8]}" 1874 - header_title = f"# Detected Entities: {facet_name} ({formatted_day})\n" 1875 - else: 1876 - header_title = f"# Attached Entities: {facet_name}\n" 1877 - 1878 - entity_count = len(entries) 1879 - 
meta["header"] = f"{header_title}\n{entity_count} entities" 1880 - 1881 - # Calculate base timestamp for detected entities (midnight of that day) 1882 - detected_base_ts = 0 1883 - if is_detected and day_str: 1884 - try: 1885 - dt = datetime.strptime(day_str, "%Y%m%d") 1886 - detected_base_ts = int(dt.timestamp() * 1000) 1887 - except ValueError: 1888 - pass 1889 - 1890 - # Format each entity as a chunk 1891 - for entity in entries: 1892 - etype = entity.get("type", "Unknown") 1893 - name = entity.get("name", "Unnamed") 1894 - description = entity.get("description", "") 1895 - 1896 - # Determine timestamp 1897 - if is_detected: 1898 - ts = detected_base_ts 1899 - else: 1900 - # Attached: use activity timestamp (full fallback chain) 1901 - ts = entity_last_active_ts(entity) 1902 - 1903 - # Build markdown for this entity 1904 - lines = [ 1905 - f"### {etype}: {name}\n", 1906 - "", 1907 - ] 1908 - 1909 - # Description or placeholder 1910 - if description: 1911 - lines.append(description) 1912 - else: 1913 - lines.append("*(No description available)*") 1914 - lines.append("") 1915 - 1916 - # Additional fields (skip core fields, timestamp fields, id, and detached flag) 1917 - skip_fields = { 1918 - "id", 1919 - "type", 1920 - "name", 1921 - "description", 1922 - "updated_at", 1923 - "attached_at", 1924 - "last_seen", 1925 - "detached", 1926 - } 1927 - 1928 - # Handle tags specially 1929 - tags = entity.get("tags") 1930 - if tags and isinstance(tags, list): 1931 - lines.append(f"**Tags:** {', '.join(tags)}") 1932 - 1933 - # Handle aka specially 1934 - aka = entity.get("aka") 1935 - if aka and isinstance(aka, list): 1936 - lines.append(f"**Also known as:** {', '.join(aka)}") 1937 - 1938 - # Other custom fields 1939 - for key, value in entity.items(): 1940 - if key in skip_fields or key in ("tags", "aka"): 1941 - continue 1942 - # Format value appropriately 1943 - if isinstance(value, list): 1944 - value_str = ", ".join(str(v) for v in value) 1945 - else: 1946 - value_str 
= str(value) 1947 - # Capitalize first letter of key for display 1948 - display_key = key.replace("_", " ").title() 1949 - lines.append(f"**{display_key}:** {value_str}") 1950 - 1951 - lines.append("") 1952 - 1953 - chunks.append( 1954 - { 1955 - "timestamp": ts, 1956 - "markdown": "\n".join(lines), 1957 - "source": entity, 1958 - } 1959 - ) 1960 - 1961 - # Indexer metadata - topic depends on attached vs detected 1962 - topic = "entity:detected" if is_detected else "entity:attached" 1963 - meta["indexer"] = {"topic": topic} 1964 - 1965 - return chunks, meta 1966 - 1967 - 1968 - # ----------------------------------------------------------------------------- 1969 - # Entity Observations 1970 - # ----------------------------------------------------------------------------- 1971 - 1972 - 1973 - class ObservationNumberError(Exception): 1974 - """Raised when observation_number doesn't match expected value.""" 1975 - 1976 - def __init__(self, expected: int, actual: int): 1977 - self.expected = expected 1978 - self.actual = actual 1979 - super().__init__( 1980 - f"Observation number mismatch: expected {expected}, got {actual}" 1981 - ) 1982 - 1983 - 1984 - def observations_file_path(facet: str, name: str) -> Path: 1985 - """Return path to observations file for an entity. 1986 - 1987 - Observations are stored in the entity's memory folder: 1988 - facets/{facet}/entities/{entity_slug}/observations.jsonl 1989 - 1990 - Args: 1991 - facet: Facet name (e.g., "personal", "work") 1992 - name: Entity name (will be slugified) 1993 - 1994 - Returns: 1995 - Path to observations.jsonl file 1996 - 1997 - Raises: 1998 - ValueError: If name slugifies to empty string 1999 - """ 2000 - folder = entity_memory_path(facet, name) 2001 - return folder / "observations.jsonl" 2002 - 2003 - 2004 - def load_observations(facet: str, name: str) -> list[dict[str, Any]]: 2005 - """Load observations for an entity. 
2006 - 2007 - Args: 2008 - facet: Facet name 2009 - name: Entity name 2010 - 2011 - Returns: 2012 - List of observation dictionaries with content, observed_at, source_day keys. 2013 - Returns empty list if file doesn't exist. 2014 - 2015 - Example: 2016 - >>> load_observations("work", "Alice Johnson") 2017 - [{"content": "Prefers async communication", "observed_at": 1736784000000, "source_day": "20250113"}] 2018 - """ 2019 - path = observations_file_path(facet, name) 2020 - 2021 - if not path.exists(): 2022 - return [] 2023 - 2024 - observations = [] 2025 - with open(path, "r", encoding="utf-8") as f: 2026 - for line in f: 2027 - line = line.strip() 2028 - if not line: 2029 - continue 2030 - try: 2031 - data = json.loads(line) 2032 - observations.append(data) 2033 - except json.JSONDecodeError: 2034 - continue # Skip malformed lines 2035 - 2036 - return observations 2037 - 2038 - 2039 - def save_observations( 2040 - facet: str, name: str, observations: list[dict[str, Any]] 2041 - ) -> None: 2042 - """Save observations to entity's observations file using atomic write. 
2043 - 2044 - Args: 2045 - facet: Facet name 2046 - name: Entity name 2047 - observations: List of observation dictionaries 2048 - """ 2049 - path = observations_file_path(facet, name) 2050 - 2051 - # Create parent directory (entity memory folder) if needed 2052 - path.parent.mkdir(parents=True, exist_ok=True) 2053 - 2054 - # Format observations as JSONL 2055 - lines = [] 2056 - for obs in observations: 2057 - lines.append(json.dumps(obs, ensure_ascii=False) + "\n") 2058 - 2059 - # Atomic write using temp file + rename 2060 - fd, temp_path = tempfile.mkstemp( 2061 - dir=path.parent, prefix=".observations_", suffix=".tmp" 2062 - ) 2063 - try: 2064 - with os.fdopen(fd, "w", encoding="utf-8") as f: 2065 - f.writelines(lines) 2066 - os.replace(temp_path, path) 2067 - except Exception: 2068 - # Clean up temp file on error 2069 - try: 2070 - os.unlink(temp_path) 2071 - except Exception: 2072 - pass 2073 - raise 2074 - 2075 - 2076 - def add_observation( 2077 - facet: str, 2078 - name: str, 2079 - content: str, 2080 - observation_number: int, 2081 - source_day: str | None = None, 2082 - ) -> dict[str, Any]: 2083 - """Add an observation to an entity with guard validation. 2084 - 2085 - Similar to todo_add, requires the caller to provide the expected next 2086 - observation number (current count + 1) to prevent stale writes. 
2087 - 2088 - Args: 2089 - facet: Facet name 2090 - name: Entity name 2091 - content: The observation text 2092 - observation_number: Expected next number; must be current_count + 1 2093 - source_day: Optional day (YYYYMMDD) when observation was made 2094 - 2095 - Returns: 2096 - Dictionary with updated observations list and count 2097 - 2098 - Raises: 2099 - ObservationNumberError: If observation_number doesn't match expected 2100 - ValueError: If content is empty 2101 - 2102 - Example: 2103 - >>> add_observation("work", "Alice", "Prefers morning meetings", 1, "20250113") 2104 - {"observations": [...], "count": 1} 2105 - """ 2106 - content = content.strip() 2107 - if not content: 2108 - raise ValueError("Observation content cannot be empty") 2109 - 2110 - observations = load_observations(facet, name) 2111 - expected = len(observations) + 1 2112 - 2113 - if observation_number != expected: 2114 - raise ObservationNumberError(expected, observation_number) 2115 - 2116 - # Create new observation 2117 - observation = { 2118 - "content": content, 2119 - "observed_at": int(time.time() * 1000), 2120 - } 2121 - if source_day: 2122 - observation["source_day"] = source_day 2123 - 2124 - observations.append(observation) 2125 - save_observations(facet, name, observations) 2126 - 2127 - return {"observations": observations, "count": len(observations)} 2128 - 2129 - 2130 - # ----------------------------------------------------------------------------- 2131 - # Journal Entity Management (Block/Delete) 2132 - # ----------------------------------------------------------------------------- 2133 - 2134 - 2135 - def block_journal_entity(entity_id: str) -> dict[str, Any]: 2136 - """Block a journal entity and detach all facet relationships. 2137 - 2138 - Sets `blocked: true` on the journal entity and `detached: true` on all 2139 - facet relationships. This is a soft disable that hides the entity from 2140 - active use while preserving all data. 
2141 - 2142 - Args: 2143 - entity_id: Entity ID (slug) 2144 - 2145 - Returns: 2146 - Dict with: 2147 - - success: True if blocked 2148 - - facets_detached: List of facet names where relationships were detached 2149 - 2150 - Raises: 2151 - ValueError: If entity not found or is the principal entity 2152 - """ 2153 - journal_entity = load_journal_entity(entity_id) 2154 - if not journal_entity: 2155 - raise ValueError(f"Entity '{entity_id}' not found") 2156 - 2157 - if journal_entity.get("is_principal"): 2158 - raise ValueError("Cannot block the principal (self) entity") 2159 - 2160 - # Set blocked flag on journal entity 2161 - journal_entity["blocked"] = True 2162 - journal_entity["updated_at"] = int(time.time() * 1000) 2163 - save_journal_entity(journal_entity) 2164 - 2165 - # Detach all facet relationships 2166 - facets_detached = [] 2167 - facets_dir = Path(get_journal()) / "facets" 2168 - if facets_dir.exists(): 2169 - for facet_path in facets_dir.iterdir(): 2170 - if not facet_path.is_dir(): 2171 - continue 2172 - facet_name = facet_path.name 2173 - 2174 - relationship = load_facet_relationship(facet_name, entity_id) 2175 - if relationship and not relationship.get("detached"): 2176 - relationship["detached"] = True 2177 - relationship["updated_at"] = int(time.time() * 1000) 2178 - save_facet_relationship(facet_name, entity_id, relationship) 2179 - facets_detached.append(facet_name) 2180 - 2181 - return {"success": True, "facets_detached": facets_detached} 2182 - 2183 - 2184 - def unblock_journal_entity(entity_id: str) -> dict[str, Any]: 2185 - """Unblock a journal entity. 2186 - 2187 - Clears the `blocked` flag on the journal entity. Does NOT automatically 2188 - reattach facet relationships - the user must do that manually per-facet. 
2189 - 2190 - Args: 2191 - entity_id: Entity ID (slug) 2192 - 2193 - Returns: 2194 - Dict with: 2195 - - success: True if unblocked 2196 - 2197 - Raises: 2198 - ValueError: If entity not found or not blocked 2199 - """ 2200 - journal_entity = load_journal_entity(entity_id) 2201 - if not journal_entity: 2202 - raise ValueError(f"Entity '{entity_id}' not found") 2203 - 2204 - if not journal_entity.get("blocked"): 2205 - raise ValueError(f"Entity '{entity_id}' is not blocked") 2206 - 2207 - # Clear blocked flag 2208 - journal_entity.pop("blocked", None) 2209 - journal_entity["updated_at"] = int(time.time() * 1000) 2210 - save_journal_entity(journal_entity) 2211 - 2212 - return {"success": True} 2213 - 2214 - 2215 - def delete_journal_entity(entity_id: str) -> dict[str, Any]: 2216 - """Permanently delete a journal entity and all facet relationships. 2217 - 2218 - This is a destructive operation that removes: 2219 - - The journal entity directory (entities/<id>/) 2220 - - All facet relationship directories (facets/*/entities/<id>/) 2221 - - All entity memory (voiceprints, observations) in those directories 2222 - 2223 - Args: 2224 - entity_id: Entity ID (slug) 2225 - 2226 - Returns: 2227 - Dict with: 2228 - - success: True if deleted 2229 - - facets_deleted: List of facet names where relationships were deleted 2230 - 2231 - Raises: 2232 - ValueError: If entity not found or is the principal entity 2233 - """ 2234 - journal_entity = load_journal_entity(entity_id) 2235 - if not journal_entity: 2236 - raise ValueError(f"Entity '{entity_id}' not found") 2237 - 2238 - if journal_entity.get("is_principal"): 2239 - raise ValueError("Cannot delete the principal (self) entity") 2240 - 2241 - facets_deleted = [] 2242 - 2243 - # Delete all facet relationship directories 2244 - facets_dir = Path(get_journal()) / "facets" 2245 - if facets_dir.exists(): 2246 - for facet_path in facets_dir.iterdir(): 2247 - if not facet_path.is_dir(): 2248 - continue 2249 - facet_name = facet_path.name 
2250 - 2251 - # Check for relationship directory (contains entity.json and memory) 2252 - rel_dir = facet_path / "entities" / entity_id 2253 - if rel_dir.exists() and rel_dir.is_dir(): 2254 - shutil.rmtree(rel_dir) 2255 - facets_deleted.append(facet_name) 2256 - 2257 - # Delete journal entity directory 2258 - journal_dir = Path(get_journal()) / "entities" / entity_id 2259 - if journal_dir.exists() and journal_dir.is_dir(): 2260 - shutil.rmtree(journal_dir) 2261 - 2262 - return {"success": True, "facets_deleted": facets_deleted}
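The hard-delete path above (remove every `facets/*/entities/<id>/` directory, then the journal-level `entities/<id>/` directory) can be exercised standalone. This is a sketch against a throwaway temp tree, not the real journal; `alice` and the directory contents are hypothetical, and the real `delete_journal_entity` additionally guards against deleting the principal entity.

```python
import shutil
import tempfile
from pathlib import Path

# Throwaway tree mimicking the journal layout shown in the diff.
journal = Path(tempfile.mkdtemp())
(journal / "entities" / "alice").mkdir(parents=True)
(journal / "facets" / "work" / "entities" / "alice").mkdir(parents=True)
(journal / "facets" / "personal").mkdir(parents=True)  # facet with no relationship

entity_id = "alice"
facets_deleted = []
for facet_path in (journal / "facets").iterdir():
    if not facet_path.is_dir():
        continue
    # Relationship dir also holds per-facet memory (voiceprints, observations),
    # so removing it drops both in one rmtree.
    rel_dir = facet_path / "entities" / entity_id
    if rel_dir.is_dir():
        shutil.rmtree(rel_dir)
        facets_deleted.append(facet_path.name)

# Finally remove the canonical journal-level entity directory.
journal_dir = journal / "entities" / entity_id
if journal_dir.is_dir():
    shutil.rmtree(journal_dir)

print(sorted(facets_deleted))  # ['work']
```

Facets without a relationship directory are untouched, which is why the returned `facets_deleted` list only names facets where something was actually removed.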
+162
think/entities/__init__.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Entity management with journal-wide identity and facet-scoped relationships. 5 + 6 + Entity System Architecture: 7 + - Journal-level entities: entities/<id>/entity.json - canonical identity (name, type, aka) 8 + - Facet relationships: facets/<facet>/entities/<id>/entity.json - per-facet data 9 + - Detected entities: facets/<facet>/entities/<day>.jsonl - ephemeral daily discoveries 10 + - Entity memory: facets/<facet>/entities/<id>/ - voiceprints, observations (per-facet) 11 + 12 + This package is organized into focused modules: 13 + - core: Types, constants, validation, slug generation 14 + - journal: Journal-level entity CRUD 15 + - relationships: Facet relationships and entity memory 16 + - loading: Entity loading functions 17 + - saving: Entity saving functions 18 + - matching: Entity resolution and fuzzy matching 19 + - activity: Activity tracking and detected entities 20 + - observations: Observation CRUD 21 + - formatting: Indexer formatting 22 + """ 23 + 24 + # Core types and utilities 25 + from think.entities.core import ( 26 + DEFAULT_ACTIVITY_TS, 27 + ENTITY_TYPES, 28 + MAX_ENTITY_SLUG_LENGTH, 29 + EntityDict, 30 + atomic_write, 31 + entity_last_active_ts, 32 + entity_slug, 33 + get_identity_names, 34 + is_valid_entity_type, 35 + ) 36 + 37 + # Journal-level entity management 38 + from think.entities.journal import ( 39 + block_journal_entity, 40 + delete_journal_entity, 41 + get_or_create_journal_entity, 42 + has_journal_principal, 43 + journal_entity_path, 44 + load_all_journal_entities, 45 + load_journal_entity, 46 + save_journal_entity, 47 + scan_journal_entities, 48 + unblock_journal_entity, 49 + ) 50 + 51 + # Facet relationships and memory 52 + from think.entities.relationships import ( 53 + ensure_entity_memory, 54 + entity_memory_path, 55 + facet_relationship_path, 56 + load_facet_relationship, 57 + rename_entity_memory, 58 + save_facet_relationship, 59 + 
scan_facet_relationships, 60 + ) 61 + 62 + # Entity loading 63 + from think.entities.loading import ( 64 + detected_entities_path, 65 + load_all_attached_entities, 66 + load_entities, 67 + load_entity_names, 68 + load_recent_entity_names, 69 + parse_entity_file, 70 + ) 71 + 72 + # Entity saving 73 + from think.entities.saving import ( 74 + save_entities, 75 + update_entity_description, 76 + ) 77 + 78 + # Entity matching and resolution 79 + from think.entities.matching import ( 80 + find_matching_attached_entity, 81 + resolve_entity, 82 + validate_aka_uniqueness, 83 + ) 84 + 85 + # Activity tracking 86 + from think.entities.activity import ( 87 + load_detected_entities_recent, 88 + parse_knowledge_graph_entities, 89 + touch_entities_from_activity, 90 + touch_entity, 91 + ) 92 + 93 + # Observations 94 + from think.entities.observations import ( 95 + ObservationNumberError, 96 + add_observation, 97 + load_observations, 98 + observations_file_path, 99 + save_observations, 100 + ) 101 + 102 + # Formatting (for indexer) 103 + from think.entities.formatting import format_entities 104 + 105 + __all__ = [ 106 + # Core 107 + "DEFAULT_ACTIVITY_TS", 108 + "ENTITY_TYPES", 109 + "MAX_ENTITY_SLUG_LENGTH", 110 + "EntityDict", 111 + "atomic_write", 112 + "entity_last_active_ts", 113 + "entity_slug", 114 + "get_identity_names", 115 + "is_valid_entity_type", 116 + # Journal 117 + "block_journal_entity", 118 + "delete_journal_entity", 119 + "get_or_create_journal_entity", 120 + "has_journal_principal", 121 + "journal_entity_path", 122 + "load_all_journal_entities", 123 + "load_journal_entity", 124 + "save_journal_entity", 125 + "scan_journal_entities", 126 + "unblock_journal_entity", 127 + # Relationships 128 + "ensure_entity_memory", 129 + "entity_memory_path", 130 + "facet_relationship_path", 131 + "load_facet_relationship", 132 + "rename_entity_memory", 133 + "save_facet_relationship", 134 + "scan_facet_relationships", 135 + # Loading 136 + "detected_entities_path", 137 + 
"load_all_attached_entities", 138 + "load_entities", 139 + "load_entity_names", 140 + "load_recent_entity_names", 141 + "parse_entity_file", 142 + # Saving 143 + "save_entities", 144 + "update_entity_description", 145 + # Matching 146 + "find_matching_attached_entity", 147 + "resolve_entity", 148 + "validate_aka_uniqueness", 149 + # Activity 150 + "load_detected_entities_recent", 151 + "parse_knowledge_graph_entities", 152 + "touch_entities_from_activity", 153 + "touch_entity", 154 + # Observations 155 + "ObservationNumberError", 156 + "add_observation", 157 + "load_observations", 158 + "observations_file_path", 159 + "save_observations", 160 + # Formatting 161 + "format_entities", 162 + ]
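The point of the re-export list above is that callers written against the old monolithic `think/entities.py` keep importing from `think.entities` unchanged. A toy stand-in illustrates the mechanism; `pkg` and the lambda are hypothetical (the real `entity_slug` lives in `think/entities/core.py` and uses python-slugify with truncation):

```python
import sys
import types

# Build a tiny two-module "package" by hand: a submodule that owns the
# implementation, and a parent that re-exports it, as __init__.py does.
core = types.ModuleType("pkg.core")
core.entity_slug = lambda name: name.lower().replace(" ", "-")  # crude stand-in

pkg = types.ModuleType("pkg")
pkg.core = core
pkg.entity_slug = core.entity_slug  # the re-export
sys.modules["pkg"] = pkg
sys.modules["pkg.core"] = core

# Caller code written against the pre-split module keeps working unchanged:
from pkg import entity_slug
print(entity_slug("Alice Johnson"))  # alice-johnson
```

Because `__all__` mirrors the imports, `from think.entities import *` and tooling that introspects the package surface see the same flat API the monolith exposed.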
+279
think/entities/activity.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Entity activity tracking and detected entity management. 5 + 6 + This module handles: 7 + - Updating last_seen timestamps on entities 8 + - Parsing knowledge graph for entity names 9 + - Loading detected entities with aggregation 10 + """ 11 + 12 + import re 13 + from datetime import datetime, timedelta 14 + from pathlib import Path 15 + from typing import Any 16 + 17 + from think.entities.core import EntityDict 18 + from think.entities.loading import load_entities, parse_entity_file 19 + from think.entities.matching import find_matching_attached_entity 20 + from think.entities.saving import save_entities 21 + from think.utils import get_journal 22 + 23 + 24 + def touch_entity(facet: str, name: str, day: str) -> str: 25 + """Update last_seen timestamp on an attached entity. 26 + 27 + Sets the last_seen field to the provided day if the entity exists 28 + and either has no last_seen or the new day is more recent. 
29 + 30 + Args: 31 + facet: Facet name 32 + name: Exact name of the attached entity to touch 33 + day: Day string in YYYYMMDD format 34 + 35 + Returns: 36 + "updated" if entity was found and last_seen was updated, 37 + "skipped" if entity was found but day is not more recent, 38 + "not_found" if entity was not found 39 + 40 + Example: 41 + >>> touch_entity("work", "Alice Johnson", "20250115") 42 + "updated" 43 + """ 44 + # Load ALL attached entities including detached to avoid data loss on save 45 + entities = load_entities(facet, day=None, include_detached=True) 46 + 47 + for entity in entities: 48 + # Skip detached entities 49 + if entity.get("detached"): 50 + continue 51 + if entity.get("name") == name: 52 + current_last_seen = entity.get("last_seen", "") 53 + # Only update if new day is more recent (or no existing last_seen) 54 + if not current_last_seen or day > current_last_seen: 55 + entity["last_seen"] = day 56 + save_entities(facet, entities, day=None) 57 + return "updated" 58 + # Entity found but day is not more recent 59 + return "skipped" 60 + 61 + return "not_found" 62 + 63 + 64 + def parse_knowledge_graph_entities(day: str) -> list[str]: 65 + """Parse entity names from a day's knowledge graph. 66 + 67 + Extracts entity names from markdown tables in the knowledge graph insight. 68 + Entity names appear in bold (**Name**) in the first column of tables. 69 + 70 + Args: 71 + day: Day string in YYYYMMDD format 72 + 73 + Returns: 74 + List of unique entity names found in the knowledge graph. 75 + Returns empty list if KG doesn't exist or can't be parsed. 76 + 77 + Example: 78 + >>> parse_knowledge_graph_entities("20260108") 79 + ["Jeremie Miller (Jer)", "Neal Satterfield", "Flightline", ...] 
80 + """ 81 + journal = get_journal() 82 + kg_path = Path(journal) / day / "insights" / "knowledge_graph.md" 83 + 84 + if not kg_path.exists(): 85 + return [] 86 + 87 + try: 88 + content = kg_path.read_text(encoding="utf-8") 89 + except (OSError, UnicodeDecodeError): 90 + return [] 91 + 92 + # Extract bold names from first column of markdown tables 93 + # Pattern matches: | **Name** | ... (first column of table rows) 94 + # Also matches relationship mapping tables: | **Name** | **Target** | ... 95 + entity_names: set[str] = set() 96 + 97 + # Match table rows with bold text in first or second column 98 + # Format: | **Entity Name** | Type | ... or | **Source** | **Target** | ... 99 + table_row_pattern = re.compile(r"^\|\s*\*\*(.+?)\*\*\s*\|", re.MULTILINE) 100 + 101 + for match in table_row_pattern.finditer(content): 102 + name = match.group(1).strip() 103 + if name: 104 + entity_names.add(name) 105 + 106 + # Also extract targets from relationship mapping (second column) 107 + # Format: | **Source** | **Target** | Relationship | ... 108 + relationship_pattern = re.compile( 109 + r"^\|\s*\*\*.+?\*\*\s*\|\s*\*\*(.+?)\*\*\s*\|", re.MULTILINE 110 + ) 111 + 112 + for match in relationship_pattern.finditer(content): 113 + name = match.group(1).strip() 114 + if name: 115 + entity_names.add(name) 116 + 117 + return list(entity_names) 118 + 119 + 120 + def touch_entities_from_activity( 121 + facet: str, names: list[str], day: str 122 + ) -> dict[str, Any]: 123 + """Update last_seen for attached entities matching activity names. 124 + 125 + For each name in the activity list, attempts to find a matching 126 + attached entity using fuzzy matching and updates its last_seen field. 
127 + 128 + Args: 129 + facet: Facet name 130 + names: List of entity names from activity (e.g., knowledge graph) 131 + day: Day string in YYYYMMDD format 132 + 133 + Returns: 134 + Summary dict with: 135 + - matched: List of (activity_name, attached_name) tuples for matches found 136 + - updated: List of attached entity names that were updated 137 + - skipped: List of attached entity names already up-to-date 138 + 139 + Example: 140 + >>> touch_entities_from_activity("work", ["Bob", "FAA"], "20260108") 141 + {"matched": [("Bob", "Robert Johnson"), ("FAA", "Federal Aviation Administration")], 142 + "updated": ["Robert Johnson", "Federal Aviation Administration"], 143 + "skipped": []} 144 + """ 145 + if not names: 146 + return {"matched": [], "updated": [], "skipped": []} 147 + 148 + # Load attached entities (excluding detached) 149 + attached = load_entities(facet, day=None, include_detached=False) 150 + if not attached: 151 + return {"matched": [], "updated": [], "skipped": []} 152 + 153 + # Track matches and which entities need updating 154 + matched: list[tuple[str, str]] = [] 155 + needs_update: dict[str, str] = {} # attached_name -> most_recent_day 156 + 157 + for activity_name in names: 158 + entity = find_matching_attached_entity(activity_name, attached) 159 + if entity: 160 + attached_name = entity.get("name", "") 161 + if attached_name: 162 + matched.append((activity_name, attached_name)) 163 + # Track the day for this entity (may be touched multiple times) 164 + current = needs_update.get(attached_name, "") 165 + if not current or day > current: 166 + needs_update[attached_name] = day 167 + 168 + # Now batch the updates 169 + updated: list[str] = [] 170 + skipped: list[str] = [] 171 + 172 + for attached_name, update_day in needs_update.items(): 173 + result = touch_entity(facet, attached_name, update_day) 174 + if result == "updated": 175 + updated.append(attached_name) 176 + else: 177 + # "skipped" (already up-to-date) or "not_found" 178 + 
skipped.append(attached_name) 179 + 180 + return {"matched": matched, "updated": updated, "skipped": skipped} 181 + 182 + 183 + def load_detected_entities_recent(facet: str, days: int = 30) -> list[EntityDict]: 184 + """Load detected entities from last N days, excluding those matching attached entities. 185 + 186 + Scans detected entity files in reverse chronological order (newest first), 187 + aggregating by (type, name) to provide count and last_seen tracking. 188 + 189 + Uses fuzzy matching to exclude detected entities that match attached entities 190 + by name, aka, normalized form, first word, or fuzzy similarity. 191 + 192 + Args: 193 + facet: Facet name 194 + days: Number of days to look back (default: 30) 195 + 196 + Returns: 197 + List of detected entity dictionaries with aggregation data: 198 + - type: Entity type 199 + - name: Entity name 200 + - description: Description from most recent detection 201 + - count: Number of days entity was detected 202 + - last_seen: Most recent day (YYYYMMDD) entity was detected 203 + 204 + Entities are excluded if they match an attached entity via fuzzy matching. 
205 + 206 + Example: 207 + >>> load_detected_entities_recent("personal", days=30) 208 + [{"type": "Person", "name": "Charlie", "description": "Met at coffee shop", 209 + "count": 3, "last_seen": "20250115"}] 210 + """ 211 + journal = get_journal() 212 + 213 + # Load attached entities (excluding detached) for fuzzy matching 214 + # Detached entities should appear in detected list again 215 + attached = load_entities(facet, include_detached=False) 216 + 217 + # Cache for already-checked names to avoid repeated fuzzy matching 218 + # Maps detected name -> True (excluded) or False (not excluded) 219 + exclusion_cache: dict[str, bool] = {} 220 + 221 + def is_excluded(name: str) -> bool: 222 + """Check if a detected name matches any attached entity.""" 223 + if name in exclusion_cache: 224 + return exclusion_cache[name] 225 + match = find_matching_attached_entity(name, attached) 226 + excluded = match is not None 227 + exclusion_cache[name] = excluded 228 + return excluded 229 + 230 + # Calculate date range cutoff 231 + cutoff_date = datetime.now() - timedelta(days=days) 232 + cutoff_str = cutoff_date.strftime("%Y%m%d") 233 + 234 + # Get entities directory and find all day files 235 + entities_dir = Path(journal) / "facets" / facet / "entities" 236 + if not entities_dir.exists(): 237 + return [] 238 + 239 + # Glob day files and sort descending (newest first) 240 + day_files = sorted(entities_dir.glob("*.jsonl"), reverse=True) 241 + 242 + # Aggregate entities by (type, name) 243 + # Key: (type, name) -> {entity data with count, last_seen} 244 + detected_map: dict[tuple[str, str], EntityDict] = {} 245 + 246 + for day_file in day_files: 247 + day = day_file.stem # YYYYMMDD 248 + 249 + # Skip files outside date range 250 + if day < cutoff_str: 251 + continue 252 + 253 + # Parse entities from this day 254 + day_entities = parse_entity_file(str(day_file)) 255 + 256 + for entity in day_entities: 257 + etype = entity.get("type", "") 258 + name = entity.get("name", "") 259 + 260 
+ # Skip if matches attached entity (using fuzzy matching) 261 + if is_excluded(name): 262 + continue 263 + 264 + key = (etype, name) 265 + 266 + if key not in detected_map: 267 + # First occurrence (most recent day) - store full entity 268 + detected_map[key] = { 269 + "type": etype, 270 + "name": name, 271 + "description": entity.get("description", ""), 272 + "count": 1, 273 + "last_seen": day, 274 + } 275 + else: 276 + # Subsequent occurrence - just increment count 277 + detected_map[key]["count"] += 1 278 + 279 + return list(detected_map.values())
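The aggregation above can be sketched standalone. This is a minimal model of the dedup-and-count logic in `load_detected_entities_recent()`, not the actual `think.entities` code — the day data here is hypothetical:

```python
# Day files are walked newest-first, so the first occurrence of a
# (type, name) key wins for description/last_seen, and later
# occurrences only bump the count.
day_files = {  # hypothetical detected-entity day files
    "20250116": [{"type": "Person", "name": "Charlie", "description": "Coffee shop"}],
    "20250115": [{"type": "Person", "name": "Charlie", "description": "Old note"},
                 {"type": "Tool", "name": "pytest", "description": "Test runner"}],
}

detected_map = {}
for day in sorted(day_files, reverse=True):  # newest first
    for entity in day_files[day]:
        key = (entity["type"], entity["name"])
        if key not in detected_map:
            detected_map[key] = {**entity, "count": 1, "last_seen": day}
        else:
            detected_map[key]["count"] += 1

result = list(detected_map.values())
```

Charlie ends up with `count: 2` but keeps the description and `last_seen` from the most recent day.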
+212
think/entities/core.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Core entity types, constants, and validation utilities. 5 + 6 + This module provides the foundational types and functions used throughout 7 + the entity system: 8 + - Type aliases for better documentation 9 + - Constants for default values and validation 10 + - Identity and slug generation 11 + - Type validation and the atomic_write file utility 12 + """ 13 + 14 + import hashlib 15 + import os 16 + import re 17 + import tempfile 18 + from datetime import datetime 19 + from pathlib import Path 20 + from typing import Any 21 + 22 + from slugify import slugify 23 + 24 + from think.utils import get_config 25 + 26 + # Type alias for entity dictionaries 27 + EntityDict = dict[str, Any] 28 + 29 + # Default timestamp for entities without activity data (Jan 1 2026 00:00:00 UTC) 30 + # Used as fallback in entity_last_active_ts() to ensure all entities have a sortable value 31 + DEFAULT_ACTIVITY_TS = 1767225600000 32 + 33 + # Standard entity types - used for UI suggestions and documentation. 34 + # Custom types are still allowed (validated by is_valid_entity_type regex). 35 + ENTITY_TYPES = [ 36 + {"name": "Person"}, 37 + {"name": "Company"}, 38 + {"name": "Project"}, 39 + {"name": "Tool"}, 40 + ] 41 + 42 + # Maximum length for entity slug before truncation 43 + MAX_ENTITY_SLUG_LENGTH = 200 44 + 45 + 46 + def get_identity_names() -> list[str]: 47 + """Get all names/aliases for the journal principal from identity config. 48 + 49 + Returns a list of names to match against entities, in display priority order: 50 + 1. identity.preferred (nickname/preferred name) - best for display 51 + 2. identity.name (full name) 52 + 3. identity.aliases (list of alternative names) 53 + 54 + The first element (if any) is the best name for display purposes. 55 + Returns empty list if identity is not configured. 
56 + """ 57 + config = get_config() 58 + identity = config.get("identity", {}) 59 + 60 + names: list[str] = [] 61 + 62 + # Preferred name first (best for display) 63 + preferred = identity.get("preferred", "").strip() 64 + if preferred: 65 + names.append(preferred) 66 + 67 + # Full name 68 + name = identity.get("name", "").strip() 69 + if name and name not in names: 70 + names.append(name) 71 + 72 + # Aliases 73 + aliases = identity.get("aliases", []) 74 + if isinstance(aliases, list): 75 + for alias in aliases: 76 + if isinstance(alias, str): 77 + alias = alias.strip() 78 + if alias and alias not in names: 79 + names.append(alias) 80 + 81 + return names 82 + 83 + 84 + def entity_last_active_ts(entity: EntityDict) -> int: 85 + """Get entity's last activity timestamp with fallback chain. 86 + 87 + Returns a Unix timestamp (milliseconds) representing when the entity was 88 + last active, using the following priority: 89 + 1. last_seen (YYYYMMDD string, converted to local midnight) 90 + 2. updated_at (Unix ms) 91 + 3. attached_at (Unix ms) 92 + 4. DEFAULT_ACTIVITY_TS (Jan 1 2026) 93 + 94 + This ensures all entities have a sortable timestamp value. 
95 + 96 + Args: 97 + entity: Entity dictionary with optional last_seen, updated_at, attached_at fields 98 + 99 + Returns: 100 + Unix timestamp in milliseconds 101 + 102 + Examples: 103 + >>> entity_last_active_ts({"last_seen": "20260115"}) # Jan 15 2026 local midnight 104 + >>> entity_last_active_ts({"updated_at": 1700000000000}) 105 + 1700000000000 106 + >>> entity_last_active_ts({}) 107 + 1767225600000 # DEFAULT_ACTIVITY_TS (Jan 1 2026 UTC) 108 + """ 109 + # Priority 1: last_seen (YYYYMMDD string) 110 + last_seen = entity.get("last_seen") 111 + if last_seen and isinstance(last_seen, str) and len(last_seen) == 8: 112 + try: 113 + dt = datetime.strptime(last_seen, "%Y%m%d") 114 + return int(dt.timestamp() * 1000) 115 + except ValueError: 116 + pass # Malformed, fall through 117 + 118 + # Priority 2: updated_at 119 + updated_at = entity.get("updated_at") 120 + if updated_at and isinstance(updated_at, int) and updated_at > 0: 121 + return updated_at 122 + 123 + # Priority 3: attached_at 124 + attached_at = entity.get("attached_at") 125 + if attached_at and isinstance(attached_at, int) and attached_at > 0: 126 + return attached_at 127 + 128 + # Priority 4: Default 129 + return DEFAULT_ACTIVITY_TS 130 + 131 + 132 + def is_valid_entity_type(etype: str) -> bool: 133 + """Validate entity type: alphanumeric and spaces only, at least 3 characters.""" 134 + if not etype or len(etype.strip()) < 3: 135 + return False 136 + # Must contain only alphanumeric and spaces, and at least one alphanumeric character 137 + return bool( 138 + re.match(r"^[A-Za-z0-9 ]+$", etype) and re.search(r"[A-Za-z0-9]", etype) 139 + ) 140 + 141 + 142 + def entity_slug(name: str) -> str: 143 + """Generate a stable slug identifier for an entity name. 144 + 145 + The slug is used as: 146 + - The `id` field stored in entity records 147 + - Folder names for entity memory storage 148 + - URL-safe programmatic references 149 + 150 + Uses python-slugify to convert names to lowercase with underscores. 
151 + Long names are truncated with a hash suffix to ensure uniqueness. 152 + 153 + Args: 154 + name: Entity name (e.g., "Alice Johnson", "Acme Corp") 155 + 156 + Returns: 157 + Slug identifier (e.g., "alice_johnson", "acme_corp") 158 + 159 + Examples: 160 + >>> entity_slug("Alice Johnson") 161 + 'alice_johnson' 162 + >>> entity_slug("O'Brien") 163 + 'o_brien' 164 + >>> entity_slug("AT&T") 165 + 'at_t' 166 + >>> entity_slug("José García") 167 + 'jose_garcia' 168 + """ 169 + if not name or not name.strip(): 170 + return "" 171 + 172 + # Use slugify with underscore separator 173 + slug = slugify(name, separator="_") 174 + 175 + # Handle very long names - truncate and add hash suffix 176 + if len(slug) > MAX_ENTITY_SLUG_LENGTH: 177 + # Create hash of full name for uniqueness 178 + name_hash = hashlib.md5(name.encode()).hexdigest()[:8] 179 + # Truncate and append hash 180 + slug = slug[: MAX_ENTITY_SLUG_LENGTH - 9] + "_" + name_hash 181 + 182 + return slug 183 + 184 + 185 + def atomic_write(path: Path, content: str, prefix: str = ".tmp_") -> None: 186 + """Write content to a file atomically using tempfile + rename. 187 + 188 + Creates a temporary file in the same directory, writes content, 189 + then atomically renames to the target path. This ensures the 190 + target file is never in a partial state. 191 + 192 + Args: 193 + path: Target file path 194 + content: String content to write 195 + prefix: Prefix for the temporary file (default: ".tmp_") 196 + 197 + Raises: 198 + OSError: If write or rename fails 199 + """ 200 + path.parent.mkdir(parents=True, exist_ok=True) 201 + 202 + fd, temp_path = tempfile.mkstemp(dir=path.parent, prefix=prefix, suffix=".tmp") 203 + try: 204 + with os.fdopen(fd, "w", encoding="utf-8") as f: 205 + f.write(content) 206 + os.replace(temp_path, path) 207 + except Exception: 208 + try: 209 + os.unlink(temp_path) 210 + except Exception: 211 + pass 212 + raise
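The four-step fallback chain in `entity_last_active_ts()` can be illustrated with a standalone re-sketch — this mirrors the logic above but runs without the `think` package (`DEFAULT_TS` copies `DEFAULT_ACTIVITY_TS`):

```python
from datetime import datetime

DEFAULT_TS = 1767225600000  # Jan 1 2026 00:00:00 UTC, as in core.py

def last_active_ts(entity: dict) -> int:
    # 1. last_seen (YYYYMMDD string) -> local midnight in ms
    last_seen = entity.get("last_seen")
    if isinstance(last_seen, str) and len(last_seen) == 8:
        try:
            return int(datetime.strptime(last_seen, "%Y%m%d").timestamp() * 1000)
        except ValueError:
            pass  # malformed, fall through
    # 2./3. updated_at, then attached_at (Unix ms)
    for field in ("updated_at", "attached_at"):
        value = entity.get(field)
        if isinstance(value, int) and value > 0:
            return value
    # 4. default, so every entity has a sortable value
    return DEFAULT_TS
```

Note a malformed `last_seen` falls through to the next source rather than raising, which is what keeps sorting total over mixed-quality records.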
+161
think/entities/formatting.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Entity formatting for indexer. 5 + 6 + This module provides format_entities() which is registered in the formatters 7 + registry to convert entity JSONL files into markdown chunks for indexing. 8 + """ 9 + 10 + import re 11 + from datetime import datetime 12 + from pathlib import Path 13 + from typing import Any 14 + 15 + from think.entities.core import EntityDict, entity_last_active_ts 16 + 17 + 18 + def format_entities( 19 + entries: list[EntityDict], 20 + context: dict[str, Any] | None = None, 21 + ) -> tuple[list[dict[str, Any]], dict[str, Any]]: 22 + """Format entity JSONL entries to markdown chunks. 23 + 24 + This is the formatter function used by the formatters registry. 25 + Works for both attached entities (facets/*/entities/<id>/entity.json) and 26 + detected entities (facets/*/entities/*.jsonl). 27 + 28 + Args: 29 + entries: Raw JSONL entries (one entity per line) 30 + context: Optional context with: 31 + - file_path: Path to JSONL file (for extracting facet name and type) 32 + 33 + Returns: 34 + Tuple of (chunks, meta) where: 35 + - chunks: List of dicts with keys: 36 + - timestamp: int (unix ms) 37 + - markdown: str 38 + - source: dict (original entity entry) 39 + - meta: Dict with optional "header" and "error" keys 40 + """ 41 + ctx = context or {} 42 + file_path = ctx.get("file_path") 43 + meta: dict[str, Any] = {} 44 + chunks: list[dict[str, Any]] = [] 45 + 46 + # Determine if attached or detected, extract facet name and day 47 + facet_name = "unknown" 48 + is_detected = False 49 + day_str: str | None = None 50 + 51 + if file_path: 52 + file_path = Path(file_path) 53 + 54 + # Extract facet name from path 55 + # Pattern: facets/{facet}/entities/{day}.jsonl (detected entities) 56 + path_str = str(file_path) 57 + facet_match = re.search(r"facets/([^/]+)/entities", path_str) 58 + if facet_match: 59 + facet_name = facet_match.group(1) 60 + 61 + # Check if 
detected (has day in filename) 62 + if file_path.parent.name == "entities" and file_path.stem.isdigit(): 63 + is_detected = True 64 + day_str = file_path.stem 65 + 66 + # Build header 67 + if is_detected and day_str: 68 + # Format day as YYYY-MM-DD for readability 69 + formatted_day = f"{day_str[:4]}-{day_str[4:6]}-{day_str[6:8]}" 70 + header_title = f"# Detected Entities: {facet_name} ({formatted_day})\n" 71 + else: 72 + header_title = f"# Attached Entities: {facet_name}\n" 73 + 74 + entity_count = len(entries) 75 + meta["header"] = f"{header_title}\n{entity_count} entities" 76 + 77 + # Calculate base timestamp for detected entities (midnight of that day) 78 + detected_base_ts = 0 79 + if is_detected and day_str: 80 + try: 81 + dt = datetime.strptime(day_str, "%Y%m%d") 82 + detected_base_ts = int(dt.timestamp() * 1000) 83 + except ValueError: 84 + pass 85 + 86 + # Format each entity as a chunk 87 + for entity in entries: 88 + etype = entity.get("type", "Unknown") 89 + name = entity.get("name", "Unnamed") 90 + description = entity.get("description", "") 91 + 92 + # Determine timestamp 93 + if is_detected: 94 + ts = detected_base_ts 95 + else: 96 + # Attached: use activity timestamp (full fallback chain) 97 + ts = entity_last_active_ts(entity) 98 + 99 + # Build markdown for this entity 100 + lines = [ 101 + f"### {etype}: {name}\n", 102 + "", 103 + ] 104 + 105 + # Description or placeholder 106 + if description: 107 + lines.append(description) 108 + else: 109 + lines.append("*(No description available)*") 110 + lines.append("") 111 + 112 + # Additional fields (skip core fields, timestamp fields, id, and detached flag) 113 + skip_fields = { 114 + "id", 115 + "type", 116 + "name", 117 + "description", 118 + "updated_at", 119 + "attached_at", 120 + "last_seen", 121 + "detached", 122 + } 123 + 124 + # Handle tags specially 125 + tags = entity.get("tags") 126 + if tags and isinstance(tags, list): 127 + lines.append(f"**Tags:** {', '.join(tags)}") 128 + 129 + # Handle aka 
specially 130 + aka = entity.get("aka") 131 + if aka and isinstance(aka, list): 132 + lines.append(f"**Also known as:** {', '.join(aka)}") 133 + 134 + # Other custom fields 135 + for key, value in entity.items(): 136 + if key in skip_fields or key in ("tags", "aka"): 137 + continue 138 + # Format value appropriately 139 + if isinstance(value, list): 140 + value_str = ", ".join(str(v) for v in value) 141 + else: 142 + value_str = str(value) 143 + # Capitalize first letter of key for display 144 + display_key = key.replace("_", " ").title() 145 + lines.append(f"**{display_key}:** {value_str}") 146 + 147 + lines.append("") 148 + 149 + chunks.append( 150 + { 151 + "timestamp": ts, 152 + "markdown": "\n".join(lines), 153 + "source": entity, 154 + } 155 + ) 156 + 157 + # Indexer metadata - topic depends on attached vs detected 158 + topic = "entity:detected" if is_detected else "entity:attached" 159 + meta["indexer"] = {"topic": topic} 160 + 161 + return chunks, meta
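The shape of each markdown chunk can be seen in isolation. The sketch below is a simplified model of the chunk body built in `format_entities()` — heading, description or placeholder, then tags — not the full formatter (it omits `aka`, custom fields, and timestamps):

```python
def render_entity(entity: dict) -> str:
    # Heading line, blank line, description (or placeholder), then tags.
    lines = [
        f"### {entity.get('type', 'Unknown')}: {entity.get('name', 'Unnamed')}\n",
        "",
    ]
    lines.append(entity.get("description") or "*(No description available)*")
    lines.append("")
    tags = entity.get("tags")
    if isinstance(tags, list) and tags:
        lines.append(f"**Tags:** {', '.join(tags)}")
    return "\n".join(lines)

md = render_entity({"type": "Person", "name": "Alice", "tags": ["friend"]})
```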
+334
think/entities/journal.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Journal-level entity management. 5 + 6 + Journal entities are the canonical identity records stored at: 7 + entities/<id>/entity.json 8 + 9 + They contain identity fields: id, name, type, aka, is_principal, created_at, blocked. 10 + Facet-specific data (description, timestamps) is stored in facet relationships. 11 + """ 12 + 13 + import json 14 + import shutil 15 + import time 16 + from pathlib import Path 17 + from typing import Any 18 + 19 + from think.entities.core import EntityDict, atomic_write, get_identity_names 20 + from think.utils import get_journal 21 + 22 + 23 + def journal_entity_path(entity_id: str) -> Path: 24 + """Return path to journal-level entity file. 25 + 26 + Args: 27 + entity_id: Entity ID (slug) 28 + 29 + Returns: 30 + Path to entities/<id>/entity.json 31 + """ 32 + return Path(get_journal()) / "entities" / entity_id / "entity.json" 33 + 34 + 35 + def load_journal_entity(entity_id: str) -> EntityDict | None: 36 + """Load a journal-level entity by ID. 37 + 38 + Args: 39 + entity_id: Entity ID (slug) 40 + 41 + Returns: 42 + Entity dict with id, name, type, aka, is_principal, created_at fields, 43 + or None if not found. 44 + """ 45 + path = journal_entity_path(entity_id) 46 + if not path.exists(): 47 + return None 48 + 49 + try: 50 + with open(path, "r", encoding="utf-8") as f: 51 + data = json.load(f) 52 + # Ensure id is present 53 + data["id"] = entity_id 54 + return data 55 + except (json.JSONDecodeError, OSError): 56 + return None 57 + 58 + 59 + def save_journal_entity(entity: EntityDict) -> None: 60 + """Save a journal-level entity using atomic write. 61 + 62 + The entity must have an 'id' field. Creates the directory if needed. 63 + 64 + Args: 65 + entity: Entity dict with id, name, type, aka (optional), is_principal (optional), 66 + created_at fields. 
67 + 68 + Raises: 69 + ValueError: If entity has no id field 70 + """ 71 + entity_id = entity.get("id") 72 + if not entity_id: 73 + raise ValueError("Entity must have an 'id' field") 74 + 75 + path = journal_entity_path(entity_id) 76 + content = json.dumps(entity, ensure_ascii=False, indent=2) + "\n" 77 + atomic_write(path, content, prefix=".entity_") 78 + 79 + 80 + def scan_journal_entities() -> list[str]: 81 + """List all entity IDs from journal-level entities. 82 + 83 + Scans entities/ directory for subdirectories containing entity.json. 84 + 85 + Returns: 86 + List of entity IDs (directory names) 87 + """ 88 + entities_dir = Path(get_journal()) / "entities" 89 + if not entities_dir.exists(): 90 + return [] 91 + 92 + entity_ids = [] 93 + for entry in entities_dir.iterdir(): 94 + if entry.is_dir() and (entry / "entity.json").exists(): 95 + entity_ids.append(entry.name) 96 + 97 + return sorted(entity_ids) 98 + 99 + 100 + def load_all_journal_entities() -> dict[str, EntityDict]: 101 + """Load all journal-level entities. 102 + 103 + Returns: 104 + Dict mapping entity_id to entity dict 105 + """ 106 + entity_ids = scan_journal_entities() 107 + entities = {} 108 + for entity_id in entity_ids: 109 + entity = load_journal_entity(entity_id) 110 + if entity: 111 + entities[entity_id] = entity 112 + return entities 113 + 114 + 115 + def has_journal_principal() -> bool: 116 + """Check if any journal entity is already flagged as principal. 117 + 118 + Returns: 119 + True if a principal entity exists, False otherwise 120 + """ 121 + for entity_id in scan_journal_entities(): 122 + entity = load_journal_entity(entity_id) 123 + if entity and entity.get("is_principal"): 124 + return True 125 + return False 126 + 127 + 128 + def _should_be_principal(name: str, aka: list[str] | None) -> bool: 129 + """Check if an entity should be flagged as principal based on identity config. 
130 + 131 + Args: 132 + name: Entity name 133 + aka: Optional list of aliases 134 + 135 + Returns: 136 + True if the entity matches identity config, False otherwise 137 + """ 138 + identity_names = get_identity_names() 139 + if not identity_names: 140 + return False 141 + 142 + # Check if name or any aka matches identity 143 + names_to_check = [name.lower()] 144 + if aka: 145 + names_to_check.extend(a.lower() for a in aka) 146 + 147 + for identity_name in identity_names: 148 + if identity_name.lower() in names_to_check: 149 + return True 150 + 151 + return False 152 + 153 + 154 + def get_or_create_journal_entity( 155 + entity_id: str, 156 + name: str, 157 + entity_type: str, 158 + aka: list[str] | None = None, 159 + *, 160 + skip_principal: bool = False, 161 + ) -> EntityDict: 162 + """Get existing journal entity or create new one. 163 + 164 + If entity exists, returns it unchanged (does not update fields). 165 + If entity doesn't exist, creates it with provided values. 166 + 167 + Args: 168 + entity_id: Entity ID (slug) 169 + name: Entity name 170 + entity_type: Entity type (e.g., "Person", "Company") 171 + aka: Optional list of aliases 172 + skip_principal: If True, don't flag as principal even if matches identity 173 + 174 + Returns: 175 + The existing or newly created entity dict 176 + """ 177 + existing = load_journal_entity(entity_id) 178 + if existing: 179 + return existing 180 + 181 + # Create new entity 182 + entity: EntityDict = { 183 + "id": entity_id, 184 + "name": name, 185 + "type": entity_type, 186 + "created_at": int(time.time() * 1000), 187 + } 188 + if aka: 189 + entity["aka"] = aka 190 + 191 + # Check if this should be the principal 192 + # Only flag if: matches identity, no existing principal, and not skipped 193 + if ( 194 + not skip_principal 195 + and _should_be_principal(name, aka) 196 + and not has_journal_principal() 197 + ): 198 + entity["is_principal"] = True 199 + 200 + save_journal_entity(entity) 201 + return entity 202 + 203 + 204 + 
def block_journal_entity(entity_id: str) -> dict[str, Any]: 205 + """Block a journal entity and detach all facet relationships. 206 + 207 + Sets `blocked: true` on the journal entity and `detached: true` on all 208 + facet relationships. This is a soft disable that hides the entity from 209 + active use while preserving all data. 210 + 211 + Args: 212 + entity_id: Entity ID (slug) 213 + 214 + Returns: 215 + Dict with: 216 + - success: True if blocked 217 + - facets_detached: List of facet names where relationships were detached 218 + 219 + Raises: 220 + ValueError: If entity not found or is the principal entity 221 + """ 222 + # Import here to avoid circular dependency 223 + from think.entities.relationships import load_facet_relationship, save_facet_relationship 224 + 225 + journal_entity = load_journal_entity(entity_id) 226 + if not journal_entity: 227 + raise ValueError(f"Entity '{entity_id}' not found") 228 + 229 + if journal_entity.get("is_principal"): 230 + raise ValueError("Cannot block the principal (self) entity") 231 + 232 + # Set blocked flag on journal entity 233 + journal_entity["blocked"] = True 234 + journal_entity["updated_at"] = int(time.time() * 1000) 235 + save_journal_entity(journal_entity) 236 + 237 + # Detach all facet relationships 238 + facets_detached = [] 239 + facets_dir = Path(get_journal()) / "facets" 240 + if facets_dir.exists(): 241 + for facet_path in facets_dir.iterdir(): 242 + if not facet_path.is_dir(): 243 + continue 244 + facet_name = facet_path.name 245 + 246 + relationship = load_facet_relationship(facet_name, entity_id) 247 + if relationship and not relationship.get("detached"): 248 + relationship["detached"] = True 249 + relationship["updated_at"] = int(time.time() * 1000) 250 + save_facet_relationship(facet_name, entity_id, relationship) 251 + facets_detached.append(facet_name) 252 + 253 + return {"success": True, "facets_detached": facets_detached} 254 + 255 + 256 + def unblock_journal_entity(entity_id: str) -> dict[str, 
Any]: 257 + """Unblock a journal entity. 258 + 259 + Clears the `blocked` flag on the journal entity. Does NOT automatically 260 + reattach facet relationships - the user must do that manually per-facet. 261 + 262 + Args: 263 + entity_id: Entity ID (slug) 264 + 265 + Returns: 266 + Dict with: 267 + - success: True if unblocked 268 + 269 + Raises: 270 + ValueError: If entity not found or not blocked 271 + """ 272 + journal_entity = load_journal_entity(entity_id) 273 + if not journal_entity: 274 + raise ValueError(f"Entity '{entity_id}' not found") 275 + 276 + if not journal_entity.get("blocked"): 277 + raise ValueError(f"Entity '{entity_id}' is not blocked") 278 + 279 + # Clear blocked flag 280 + journal_entity.pop("blocked", None) 281 + journal_entity["updated_at"] = int(time.time() * 1000) 282 + save_journal_entity(journal_entity) 283 + 284 + return {"success": True} 285 + 286 + 287 + def delete_journal_entity(entity_id: str) -> dict[str, Any]: 288 + """Permanently delete a journal entity and all facet relationships. 
289 + 290 + This is a destructive operation that removes: 291 + - The journal entity directory (entities/<id>/) 292 + - All facet relationship directories (facets/*/entities/<id>/) 293 + - All entity memory (voiceprints, observations) in those directories 294 + 295 + Args: 296 + entity_id: Entity ID (slug) 297 + 298 + Returns: 299 + Dict with: 300 + - success: True if deleted 301 + - facets_deleted: List of facet names where relationships were deleted 302 + 303 + Raises: 304 + ValueError: If entity not found or is the principal entity 305 + """ 306 + journal_entity = load_journal_entity(entity_id) 307 + if not journal_entity: 308 + raise ValueError(f"Entity '{entity_id}' not found") 309 + 310 + if journal_entity.get("is_principal"): 311 + raise ValueError("Cannot delete the principal (self) entity") 312 + 313 + facets_deleted = [] 314 + 315 + # Delete all facet relationship directories 316 + facets_dir = Path(get_journal()) / "facets" 317 + if facets_dir.exists(): 318 + for facet_path in facets_dir.iterdir(): 319 + if not facet_path.is_dir(): 320 + continue 321 + facet_name = facet_path.name 322 + 323 + # Check for relationship directory (contains entity.json and memory) 324 + rel_dir = facet_path / "entities" / entity_id 325 + if rel_dir.exists() and rel_dir.is_dir(): 326 + shutil.rmtree(rel_dir) 327 + facets_deleted.append(facet_name) 328 + 329 + # Delete journal entity directory 330 + journal_dir = Path(get_journal()) / "entities" / entity_id 331 + if journal_dir.exists() and journal_dir.is_dir(): 332 + shutil.rmtree(journal_dir) 333 + 334 + return {"success": True, "facets_deleted": facets_deleted}
+396
think/entities/loading.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Entity loading functions. 5 + 6 + This module handles loading entities from storage: 7 + - load_entities: Load attached or detected entities for a facet 8 + - load_all_attached_entities: Load from all facets with deduplication 9 + - load_entity_names / load_recent_entity_names: For transcription context 10 + """ 11 + 12 + import json 13 + import os 14 + import re 15 + from pathlib import Path 16 + 17 + from think.entities.core import ( 18 + EntityDict, 19 + entity_last_active_ts, 20 + entity_slug, 21 + is_valid_entity_type, 22 + ) 23 + from think.entities.journal import load_all_journal_entities 24 + from think.entities.relationships import ( 25 + enrich_relationship_with_journal, 26 + load_facet_relationship, 27 + scan_facet_relationships, 28 + ) 29 + from think.utils import get_journal 30 + 31 + 32 + def detected_entities_path(facet: str, day: str) -> Path: 33 + """Return path to detected entities file for a facet and day. 34 + 35 + Args: 36 + facet: Facet name (e.g., "personal", "work") 37 + day: Day in YYYYMMDD format 38 + 39 + Returns: 40 + Path to facets/{facet}/entities/{day}.jsonl 41 + """ 42 + return Path(get_journal()) / "facets" / facet / "entities" / f"{day}.jsonl" 43 + 44 + 45 + def parse_entity_file( 46 + file_path: str, *, validate_types: bool = True 47 + ) -> list[EntityDict]: 48 + """Parse entities from a JSONL file. 49 + 50 + This is the low-level file parsing function used for detected entity files. 51 + Each line in the file should be a JSON object with type, name, and description fields. 52 + 53 + Generates `id` field (slug) for entities that don't have one. 
54 + 55 + Args: 56 + file_path: Absolute path to entities JSONL file 57 + validate_types: If True, filters out invalid entity types (default: True) 58 + 59 + Returns: 60 + List of entity dictionaries with id, type, name, and description keys 61 + 62 + Example: 63 + >>> parse_entity_file("/path/to/20250101.jsonl") 64 + [{"id": "john_smith", "type": "Person", "name": "John Smith", "description": "Friend"}] 65 + """ 66 + if not os.path.isfile(file_path): 67 + return [] 68 + 69 + entities = [] 70 + with open(file_path, "r", encoding="utf-8") as f: 71 + for line in f: 72 + line = line.strip() 73 + if not line: 74 + continue 75 + try: 76 + data = json.loads(line) 77 + etype = data.get("type", "") 78 + name = data.get("name", "") 79 + desc = data.get("description", "") 80 + 81 + # Validate if requested 82 + if validate_types and not is_valid_entity_type(etype): 83 + continue 84 + 85 + # Generate id from name if not present 86 + entity_id = data.get("id") or entity_slug(name) 87 + 88 + # Preserve all fields from JSON, ensuring core fields exist 89 + # Put id first for readability in JSONL output 90 + entity: EntityDict = { 91 + "id": entity_id, 92 + "type": etype, 93 + "name": name, 94 + "description": desc, 95 + } 96 + # Add any additional fields from the JSON 97 + for key, value in data.items(): 98 + if key not in entity: 99 + entity[key] = value 100 + 101 + entities.append(entity) 102 + except (json.JSONDecodeError, AttributeError): 103 + continue # Skip malformed lines 104 + 105 + return entities 106 + 107 + 108 + def _load_entities_from_relationships( 109 + facet: str, *, include_detached: bool = False 110 + ) -> list[EntityDict]: 111 + """Load attached entities from facet relationships + journal entities. 
112 + 113 + Args: 114 + facet: Facet name 115 + include_detached: If True, includes detached entities 116 + 117 + Returns: 118 + List of enriched entity dicts 119 + """ 120 + entity_ids = scan_facet_relationships(facet) 121 + if not entity_ids: 122 + return [] 123 + 124 + # Load all journal entities for enrichment 125 + journal_entities = load_all_journal_entities() 126 + 127 + entities = [] 128 + for entity_id in entity_ids: 129 + relationship = load_facet_relationship(facet, entity_id) 130 + if relationship is None: 131 + continue 132 + 133 + # Skip detached if not requested 134 + if not include_detached and relationship.get("detached"): 135 + continue 136 + 137 + # Enrich with journal entity data 138 + journal_entity = journal_entities.get(entity_id) 139 + enriched = enrich_relationship_with_journal(relationship, journal_entity) 140 + entities.append(enriched) 141 + 142 + return entities 143 + 144 + 145 + def load_entities( 146 + facet: str, day: str | None = None, *, include_detached: bool = False 147 + ) -> list[EntityDict]: 148 + """Load entities from facet. 149 + 150 + For attached entities (day=None), loads from facet relationships 151 + enriched with journal entity data. 152 + 153 + For detected entities (day provided), loads from day-specific JSONL files. 154 + 155 + Args: 156 + facet: Facet name 157 + day: Optional day in YYYYMMDD format for detected entities 158 + include_detached: If True, includes entities with detached=True. 159 + Default False excludes detached entities. 160 + Only applies to attached entities (day=None). 161 + 162 + Returns: 163 + List of entity dictionaries with id, type, name, description, and other fields. 
164 + 165 + Example: 166 + >>> load_entities("personal") 167 + [{"id": "john_smith", "type": "Person", "name": "John Smith", "description": "Friend"}] 168 + """ 169 + # For detected entities, use day-specific files 170 + if day is not None: 171 + path = detected_entities_path(facet, day) 172 + return parse_entity_file(str(path)) 173 + 174 + # For attached entities, load from relationships 175 + return _load_entities_from_relationships(facet, include_detached=include_detached) 176 + 177 + 178 + def load_all_attached_entities( 179 + *, 180 + sort_by: str | None = None, 181 + limit: int | None = None, 182 + ) -> list[EntityDict]: 183 + """Load all attached entities from all facets with deduplication. 184 + 185 + Iterates facets in sorted (alphabetical) order. When the same entity 186 + ID appears in multiple facets, keeps the first occurrence. 187 + 188 + Args: 189 + sort_by: Optional field to sort by. Currently supports "last_seen" 190 + which sorts by recency (entities without the field go to end). 191 + limit: Optional maximum number of entities to return (applied after 192 + deduplication and sorting). 193 + 194 + Returns: 195 + List of entity dictionaries, deduplicated by id 196 + 197 + Example: 198 + >>> load_all_attached_entities() 199 + [{"id": "john_smith", "type": "Person", "name": "John Smith", ...}, ...] 200 + 201 + >>> load_all_attached_entities(sort_by="last_seen", limit=20) 202 + # Returns 20 most recently seen entities 203 + 204 + Note: 205 + Used for agent context loading. Provides deterministic behavior 206 + despite allowing independent entity descriptions across facets. 
207 + """ 208 + facets_dir = Path(get_journal()) / "facets" 209 + if not facets_dir.exists(): 210 + return [] 211 + 212 + # Track seen IDs for deduplication (use ID instead of name for uniqueness) 213 + seen_ids: set[str] = set() 214 + all_entities: list[EntityDict] = [] 215 + 216 + # Process facets in sorted order for deterministic results 217 + for facet_path in sorted(facets_dir.iterdir()): 218 + if not facet_path.is_dir(): 219 + continue 220 + 221 + facet_name = facet_path.name 222 + 223 + for entity in load_entities(facet_name, include_detached=False): 224 + entity_id = entity.get("id", "") 225 + # Keep first occurrence only (deduplicate by ID) 226 + if entity_id and entity_id not in seen_ids: 227 + seen_ids.add(entity_id) 228 + all_entities.append(entity) 229 + 230 + # Sort if requested 231 + if sort_by == "last_seen": 232 + # Sort by activity timestamp descending (uses full fallback chain) 233 + all_entities.sort( 234 + key=entity_last_active_ts, 235 + reverse=True, 236 + ) 237 + 238 + # Apply limit if requested 239 + if limit is not None and limit > 0: 240 + all_entities = all_entities[:limit] 241 + 242 + return all_entities 243 + 244 + 245 + def _extract_spoken_names(entities: list[EntityDict]) -> list[str]: 246 + """Extract spoken-form names from entity list. 
247 + 248 + Extracts shortened forms optimized for audio transcription: 249 + - First word from base name (without parentheses) 250 + - All items from within parentheses (comma-separated) 251 + 252 + Examples: 253 + - "Ryan Reed (R2)" → ["Ryan", "R2"] 254 + - "Federal Aviation Administration (FAA)" → ["Federal", "FAA"] 255 + - "Acme Corp" → ["Acme"] 256 + 257 + Args: 258 + entities: List of entity dictionaries with "name" and optional "aka" fields 259 + 260 + Returns: 261 + List of unique spoken names, preserving insertion order 262 + """ 263 + spoken_names: list[str] = [] 264 + 265 + def add_name_variants(name: str) -> None: 266 + """Extract and add first word + parenthetical items from a name.""" 267 + if not name: 268 + return 269 + 270 + # Get base name (without parens) and extract first word 271 + base_name = re.sub(r"\s*\([^)]+\)", "", name).strip() 272 + first_word = base_name.split()[0] if base_name else None 273 + 274 + # Add first word 275 + if first_word and first_word not in spoken_names: 276 + spoken_names.append(first_word) 277 + 278 + # Extract and add all items from parens (comma-separated) 279 + paren_match = re.search(r"\(([^)]+)\)", name) 280 + if paren_match: 281 + paren_items = [item.strip() for item in paren_match.group(1).split(",")] 282 + for item in paren_items: 283 + if item and item not in spoken_names: 284 + spoken_names.append(item) 285 + 286 + for entity in entities: 287 + name = entity.get("name", "") 288 + if name: 289 + add_name_variants(name) 290 + 291 + # Process aka list with same logic 292 + aka_list = entity.get("aka", []) 293 + if isinstance(aka_list, list): 294 + for aka_name in aka_list: 295 + add_name_variants(aka_name) 296 + 297 + return spoken_names 298 + 299 + 300 + def load_entity_names( 301 + *, 302 + facet: str | None = None, 303 + spoken: bool = False, 304 + ) -> str | list[str] | None: 305 + """Load entity names from entities for AI transcription context. 
306 + 307 + This function extracts just the entity names (no types or descriptions) from 308 + entity files. When spoken=False (default), returns them as a 309 + semicolon-delimited string. When spoken=True, returns a list of shortened forms 310 + optimized for audio transcription. 311 + 312 + When facet is None, loads and merges entities from ALL facets with 313 + deduplication (first occurrence wins when same name appears in multiple facets). 314 + 315 + When spoken=True, uses uniform processing for all entity types: 316 + - Extracts first word from base name (without parentheses) 317 + - Extracts all items from within parentheses (comma-separated) 318 + - Examples: 319 + - "Ryan Reed (R2)" → ["Ryan", "R2"] 320 + - "Federal Aviation Administration (FAA)" → ["Federal", "FAA"] 321 + - "Acme Corp" → ["Acme"] 322 + - "pytest" → ["pytest"] 323 + 324 + Args: 325 + facet: Optional facet name. If provided, loads from that facet only. 326 + If None, loads from ALL facets using load_all_attached_entities(). 327 + spoken: If True, returns list of shortened forms for speech recognition. 328 + If False, returns semicolon-delimited string of full names. 329 + 330 + Returns: 331 + When spoken=False: Semicolon-delimited string of entity names with aka values in parentheses 332 + (e.g., "John Smith (Johnny); Acme Corp (ACME, AcmeCo)"), 333 + or None if no entities found. 334 + When spoken=True: List of shortened entity names for speech, or None if no entities found. 
335 + """ 336 + # Load entities using existing utilities 337 + if facet is None: 338 + # Load from ALL facets with deduplication 339 + entities = load_all_attached_entities() 340 + else: 341 + # Load from specific facet 342 + entities = load_entities(facet) 343 + 344 + if not entities: 345 + return None 346 + 347 + # Transform entity dicts into desired format 348 + if not spoken: 349 + # Non-spoken mode: semicolon-delimited string of full names with aka in parentheses 350 + entity_names = [] 351 + for entity in entities: 352 + name = entity.get("name", "") 353 + if name and name not in entity_names: 354 + # Check for aka values and append in parentheses 355 + aka_list = entity.get("aka", []) 356 + if isinstance(aka_list, list) and aka_list: 357 + # Format: "Name (aka1, aka2, aka3)" 358 + aka_str = ", ".join(aka_list) 359 + formatted_name = f"{name} ({aka_str})" 360 + else: 361 + formatted_name = name 362 + entity_names.append(formatted_name) 363 + return "; ".join(entity_names) if entity_names else None 364 + else: 365 + # Spoken mode: list of shortened forms 366 + spoken_names = _extract_spoken_names(entities) 367 + return spoken_names if spoken_names else None 368 + 369 + 370 + def load_recent_entity_names(*, limit: int = 20) -> list[str] | None: 371 + """Load recently active entity names for transcription context. 372 + 373 + Returns spoken-form names from the most recently seen entities across all 374 + facets. Caller is responsible for formatting the list as needed. 375 + 376 + Args: 377 + limit: Maximum number of entities to include (default 20) 378 + 379 + Returns: 380 + List of spoken-form entity names, or None if no entities found. 
381 +
382 +     Example:
383 +         >>> load_recent_entity_names(limit=5)
384 +         ["Alice", "Bob", "R2", "Acme", "FAA"]
385 +     """
386 +     # Get most recently seen entities
387 +     entities = load_all_attached_entities(sort_by="last_seen", limit=limit)
388 +     if not entities:
389 +         return None
390 +
391 +     # Extract spoken names
392 +     spoken_names = _extract_spoken_names(entities)
393 +     if not spoken_names:
394 +         return None
395 +
396 +     return spoken_names
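The parenthetical extraction documented above can be exercised in isolation. A minimal sketch of the same two regexes as a standalone helper (for illustration only; the real `_extract_spoken_names` also deduplicates across the whole entity list and processes `aka` entries):

```python
import re

def spoken_variants(name: str) -> list[str]:
    """First word of the base name plus comma-separated items inside parens."""
    variants: list[str] = []
    # Base name with any parenthetical stripped, e.g. "Ryan Reed (R2)" -> "Ryan Reed"
    base = re.sub(r"\s*\([^)]+\)", "", name).strip()
    if base:
        first = base.split()[0]
        if first not in variants:
            variants.append(first)
    # Everything inside the parens, comma-separated
    match = re.search(r"\(([^)]+)\)", name)
    if match:
        for item in (part.strip() for part in match.group(1).split(",")):
            if item and item not in variants:
                variants.append(item)
    return variants
```

This reproduces the documented examples: `"Ryan Reed (R2)"` yields `["Ryan", "R2"]` and `"Acme Corp"` yields `["Acme"]`.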
+277
think/entities/matching.py
···   1 + # SPDX-License-Identifier: AGPL-3.0-only
      2 + # Copyright (c) 2026 sol pbc
      3 +
      4 + """Entity matching and resolution.
      5 +
      6 + This module provides entity lookup functions:
      7 + - find_matching_attached_entity: Low-level fuzzy matching
      8 + - resolve_entity: High-level resolution with candidates
      9 + - validate_aka_uniqueness: Check for aka collisions
     10 + """
     11 +
     12 + from think.entities.core import EntityDict, entity_slug
     13 + from think.entities.loading import load_entities
     14 +
     15 +
     16 + def validate_aka_uniqueness(
     17 +     aka: str,
     18 +     entities: list[EntityDict],
     19 +     exclude_entity_name: str | None = None,
     20 +     fuzzy_threshold: int = 90,
     21 + ) -> str | None:
     22 +     """Check if an aka collides with another entity's name or aka.
     23 +
     24 +     Uses the same fuzzy matching logic as find_matching_attached_entity to
     25 +     catch collisions that would cause ambiguous lookups.
     26 +
     27 +     Args:
     28 +         aka: The alias to validate
     29 +         entities: List of entity dicts to check against
     30 +         exclude_entity_name: Entity name to exclude from checks (the entity
     31 +             being updated). Case-sensitive exact match.
32 + fuzzy_threshold: Minimum score for fuzzy matching (default: 90) 33 + 34 + Returns: 35 + Name of conflicting entity if collision found, None if ok 36 + 37 + Example: 38 + >>> entities = [{"name": "CTT", ...}, {"name": "Other", ...}] 39 + >>> validate_aka_uniqueness("CTT", entities, exclude_entity_name="Other") 40 + "CTT" # Conflicts with entity named "CTT" 41 + >>> validate_aka_uniqueness("ctt", entities, exclude_entity_name="CTT") 42 + None # Ok, adding to CTT's own akas 43 + """ 44 + # Filter out the entity being updated 45 + check_entities = [ 46 + e 47 + for e in entities 48 + if e.get("name") != exclude_entity_name and not e.get("detached") 49 + ] 50 + 51 + if not check_entities: 52 + return None 53 + 54 + # Use the existing matching function to detect collisions 55 + match = find_matching_attached_entity(aka, check_entities, fuzzy_threshold) 56 + if match: 57 + return match.get("name") 58 + 59 + return None 60 + 61 + 62 + def find_matching_attached_entity( 63 + detected_name: str, 64 + attached_entities: list[EntityDict], 65 + fuzzy_threshold: int = 90, 66 + ) -> EntityDict | None: 67 + """Find an attached entity matching a detected name. 68 + 69 + Uses tiered matching strategy (in order of precedence): 70 + 1. Exact name, id, or aka match 71 + 2. Case-insensitive name, id, or aka match 72 + 3. Slugified query match against id 73 + 4. First-word match (unambiguous only, min 3 chars) 74 + 5. 
Fuzzy match using rapidfuzz (score >= threshold) 75 + 76 + Args: 77 + detected_name: Name, id (slug), or aka to search for 78 + attached_entities: List of attached entity dicts to search 79 + fuzzy_threshold: Minimum score (0-100) for fuzzy matching (default: 90) 80 + 81 + Returns: 82 + Matched entity dict, or None if no match found 83 + 84 + Example: 85 + >>> attached = [{"id": "robert_johnson", "name": "Robert Johnson", "aka": ["Bob", "Bobby"]}] 86 + >>> find_matching_attached_entity("Bob", attached) 87 + {"id": "robert_johnson", "name": "Robert Johnson", "aka": ["Bob", "Bobby"]} 88 + >>> find_matching_attached_entity("robert_johnson", attached) 89 + {"id": "robert_johnson", "name": "Robert Johnson", "aka": ["Bob", "Bobby"]} 90 + """ 91 + if not detected_name or not attached_entities: 92 + return None 93 + 94 + detected_lower = detected_name.lower() 95 + detected_slug = entity_slug(detected_name) 96 + 97 + # Build lookup structures for efficient matching 98 + # Maps exact name/id/aka -> entity 99 + exact_map: dict[str, EntityDict] = {} 100 + # Maps id -> entity for slug matching 101 + id_map: dict[str, EntityDict] = {} 102 + # Maps lowercase first word -> list of entities (for ambiguity detection) 103 + first_word_map: dict[str, list[EntityDict]] = {} 104 + # All candidate strings for fuzzy matching -> entity 105 + fuzzy_candidates: dict[str, EntityDict] = {} 106 + 107 + for entity in attached_entities: 108 + name = entity.get("name", "") 109 + entity_id = entity.get("id", "") 110 + if not name: 111 + continue 112 + 113 + name_lower = name.lower() 114 + 115 + # Tier 1 & 2: Exact and case-insensitive for name 116 + exact_map[name] = entity 117 + exact_map[name_lower] = entity 118 + 119 + # Also add id to exact map (compute from name if not present) 120 + if entity_id: 121 + exact_map[entity_id] = entity 122 + id_map[entity_id] = entity 123 + else: 124 + # Compute slug from name for entities without id 125 + name_slug = entity_slug(name) 126 + if name_slug: 127 + 
id_map[name_slug] = entity 128 + 129 + # Also add akas 130 + aka_list = entity.get("aka", []) 131 + if isinstance(aka_list, list): 132 + for aka in aka_list: 133 + if aka: 134 + exact_map[aka] = entity 135 + exact_map[aka.lower()] = entity 136 + 137 + # Tier 4: First word 138 + first_word = name.split()[0].lower() if name else "" 139 + if first_word and len(first_word) >= 3: 140 + if first_word not in first_word_map: 141 + first_word_map[first_word] = [] 142 + first_word_map[first_word].append(entity) 143 + 144 + # Tier 5: Fuzzy candidates (name and akas) 145 + fuzzy_candidates[name] = entity 146 + if isinstance(aka_list, list): 147 + for aka in aka_list: 148 + if aka: 149 + fuzzy_candidates[aka] = entity 150 + 151 + # Tier 1: Exact match (name, id, or aka) 152 + if detected_name in exact_map: 153 + return exact_map[detected_name] 154 + 155 + # Tier 2: Case-insensitive match 156 + if detected_lower in exact_map: 157 + return exact_map[detected_lower] 158 + 159 + # Tier 3: Slugified query match against id 160 + if detected_slug and detected_slug in id_map: 161 + return id_map[detected_slug] 162 + 163 + # Tier 4: First-word match (only if unambiguous) 164 + if len(detected_name) >= 3: 165 + matches = first_word_map.get(detected_lower, []) 166 + if len(matches) == 1: 167 + return matches[0] 168 + 169 + # Tier 5: Fuzzy match 170 + if len(detected_name) >= 4 and fuzzy_candidates: 171 + try: 172 + from rapidfuzz import fuzz, process 173 + 174 + result = process.extractOne( 175 + detected_name, 176 + fuzzy_candidates.keys(), 177 + scorer=fuzz.token_sort_ratio, 178 + score_cutoff=fuzzy_threshold, 179 + ) 180 + if result: 181 + matched_str, _score, _index = result 182 + return fuzzy_candidates[matched_str] 183 + except ImportError: 184 + # rapidfuzz not available, skip fuzzy matching 185 + pass 186 + 187 + return None 188 + 189 + 190 + def resolve_entity( 191 + facet: str, 192 + query: str, 193 + fuzzy_threshold: int = 90, 194 + include_detached: bool = False, 195 + ) -> 
tuple[EntityDict | None, list[EntityDict] | None]: 196 + """Resolve an entity query to a single attached entity. 197 + 198 + This is the primary entry point for MCP tools to look up entities. 199 + Accepts any form of entity reference (name, id/slug, aka) and resolves 200 + to a single unambiguous entity. 201 + 202 + Uses tiered matching strategy: 203 + 1. Exact name, id, or aka match 204 + 2. Case-insensitive match 205 + 3. Slugified query match against id 206 + 4. First-word match (only if unambiguous) 207 + 5. Fuzzy match (if single result above threshold) 208 + 209 + Args: 210 + facet: Facet name (e.g., "personal", "work") 211 + query: Name, id (slug), or aka to search for 212 + fuzzy_threshold: Minimum score (0-100) for fuzzy matching (default: 90) 213 + include_detached: If True, also search detached entities (default: False) 214 + 215 + Returns: 216 + Tuple of (entity, candidates): 217 + - If found: (entity_dict, None) 218 + - If not found: (None, list of closest candidates) 219 + - If ambiguous: (None, list of matching candidates) 220 + 221 + Examples: 222 + >>> entity, _ = resolve_entity("work", "Alice Johnson") 223 + >>> entity, _ = resolve_entity("work", "alice_johnson") # by id 224 + >>> entity, _ = resolve_entity("work", "Ali") # by aka 225 + >>> _, candidates = resolve_entity("work", "unknown") # not found 226 + """ 227 + if not query or not query.strip(): 228 + return None, [] 229 + 230 + # Load attached entities 231 + entities = load_entities(facet, day=None, include_detached=include_detached) 232 + if not entities: 233 + return None, [] 234 + 235 + # Try to find a match 236 + match = find_matching_attached_entity(query, entities, fuzzy_threshold) 237 + if match: 238 + return match, None 239 + 240 + # No match found - find closest candidates for error message 241 + # Get top fuzzy matches as suggestions 242 + candidates: list[EntityDict] = [] 243 + 244 + try: 245 + from rapidfuzz import fuzz, process 246 + 247 + # Build candidate strings 248 + 
fuzzy_candidates: dict[str, EntityDict] = {} 249 + for entity in entities: 250 + name = entity.get("name", "") 251 + if name: 252 + fuzzy_candidates[name] = entity 253 + aka_list = entity.get("aka", []) 254 + if isinstance(aka_list, list): 255 + for aka in aka_list: 256 + if aka: 257 + fuzzy_candidates[aka] = entity 258 + 259 + # Get top 3 matches regardless of threshold 260 + results = process.extract( 261 + query, 262 + fuzzy_candidates.keys(), 263 + scorer=fuzz.token_sort_ratio, 264 + limit=3, 265 + ) 266 + seen_names: set[str] = set() 267 + for matched_str, _score, _index in results: 268 + entity = fuzzy_candidates[matched_str] 269 + name = entity.get("name", "") 270 + if name and name not in seen_names: 271 + seen_names.add(name) 272 + candidates.append(entity) 273 + except ImportError: 274 + # rapidfuzz not available, return first few entities as candidates 275 + candidates = entities[:3] 276 + 277 + return None, candidates
+156
think/entities/observations.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Entity observations management. 5 + 6 + Observations are durable factoids about entities stored in: 7 + facets/<facet>/entities/<id>/observations.jsonl 8 + 9 + They capture useful information like preferences, expertise, relationships, 10 + and biographical facts that help with future interactions. 11 + """ 12 + 13 + import json 14 + import time 15 + from pathlib import Path 16 + from typing import Any 17 + 18 + from think.entities.core import atomic_write 19 + from think.entities.relationships import entity_memory_path 20 + 21 + 22 + class ObservationNumberError(Exception): 23 + """Raised when observation_number doesn't match expected value.""" 24 + 25 + def __init__(self, expected: int, actual: int): 26 + self.expected = expected 27 + self.actual = actual 28 + super().__init__( 29 + f"Observation number mismatch: expected {expected}, got {actual}" 30 + ) 31 + 32 + 33 + def observations_file_path(facet: str, name: str) -> Path: 34 + """Return path to observations file for an entity. 35 + 36 + Observations are stored in the entity's memory folder: 37 + facets/{facet}/entities/{entity_slug}/observations.jsonl 38 + 39 + Args: 40 + facet: Facet name (e.g., "personal", "work") 41 + name: Entity name (will be slugified) 42 + 43 + Returns: 44 + Path to observations.jsonl file 45 + 46 + Raises: 47 + ValueError: If name slugifies to empty string 48 + """ 49 + folder = entity_memory_path(facet, name) 50 + return folder / "observations.jsonl" 51 + 52 + 53 + def load_observations(facet: str, name: str) -> list[dict[str, Any]]: 54 + """Load observations for an entity. 55 + 56 + Args: 57 + facet: Facet name 58 + name: Entity name 59 + 60 + Returns: 61 + List of observation dictionaries with content, observed_at, source_day keys. 62 + Returns empty list if file doesn't exist. 
63 + 64 + Example: 65 + >>> load_observations("work", "Alice Johnson") 66 + [{"content": "Prefers async communication", "observed_at": 1736784000000, "source_day": "20250113"}] 67 + """ 68 + path = observations_file_path(facet, name) 69 + 70 + if not path.exists(): 71 + return [] 72 + 73 + observations = [] 74 + with open(path, "r", encoding="utf-8") as f: 75 + for line in f: 76 + line = line.strip() 77 + if not line: 78 + continue 79 + try: 80 + data = json.loads(line) 81 + observations.append(data) 82 + except json.JSONDecodeError: 83 + continue # Skip malformed lines 84 + 85 + return observations 86 + 87 + 88 + def save_observations( 89 + facet: str, name: str, observations: list[dict[str, Any]] 90 + ) -> None: 91 + """Save observations to entity's observations file using atomic write. 92 + 93 + Args: 94 + facet: Facet name 95 + name: Entity name 96 + observations: List of observation dictionaries 97 + """ 98 + path = observations_file_path(facet, name) 99 + 100 + # Format observations as JSONL 101 + content = "".join(json.dumps(obs, ensure_ascii=False) + "\n" for obs in observations) 102 + atomic_write(path, content, prefix=".observations_") 103 + 104 + 105 + def add_observation( 106 + facet: str, 107 + name: str, 108 + content: str, 109 + observation_number: int, 110 + source_day: str | None = None, 111 + ) -> dict[str, Any]: 112 + """Add an observation to an entity with guard validation. 113 + 114 + Similar to todo_add, requires the caller to provide the expected next 115 + observation number (current count + 1) to prevent stale writes. 
116 + 117 + Args: 118 + facet: Facet name 119 + name: Entity name 120 + content: The observation text 121 + observation_number: Expected next number; must be current_count + 1 122 + source_day: Optional day (YYYYMMDD) when observation was made 123 + 124 + Returns: 125 + Dictionary with updated observations list and count 126 + 127 + Raises: 128 + ObservationNumberError: If observation_number doesn't match expected 129 + ValueError: If content is empty 130 + 131 + Example: 132 + >>> add_observation("work", "Alice", "Prefers morning meetings", 1, "20250113") 133 + {"observations": [...], "count": 1} 134 + """ 135 + content = content.strip() 136 + if not content: 137 + raise ValueError("Observation content cannot be empty") 138 + 139 + observations = load_observations(facet, name) 140 + expected = len(observations) + 1 141 + 142 + if observation_number != expected: 143 + raise ObservationNumberError(expected, observation_number) 144 + 145 + # Create new observation 146 + observation: dict[str, Any] = { 147 + "content": content, 148 + "observed_at": int(time.time() * 1000), 149 + } 150 + if source_day: 151 + observation["source_day"] = source_day 152 + 153 + observations.append(observation) 154 + save_observations(facet, name, observations) 155 + 156 + return {"observations": observations, "count": len(observations)}
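The `observation_number` guard above is an optimistic-concurrency check: the caller proves it has read the latest state by passing `current_count + 1`. A minimal in-memory sketch of the pattern (file I/O omitted; names mirror the module but this is not the actual implementation):

```python
class ObservationNumberError(Exception):
    """Raised when the caller's observation_number is stale."""

    def __init__(self, expected: int, actual: int):
        super().__init__(f"expected {expected}, got {actual}")
        self.expected = expected
        self.actual = actual

def add_observation(store: list[dict], content: str, observation_number: int) -> int:
    # Guard: caller must pass current count + 1, proving it saw the latest state
    expected = len(store) + 1
    if observation_number != expected:
        raise ObservationNumberError(expected, observation_number)
    store.append({"content": content})
    return len(store)
```

A concurrent writer that appended first bumps the count, so the stale caller's number no longer matches and the write is rejected rather than silently interleaved.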
+218
think/entities/relationships.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Facet relationship management and entity memory. 5 + 6 + Facet relationships link journal entities to specific facets with context: 7 + facets/<facet>/entities/<id>/entity.json 8 + 9 + Entity memory (observations, voiceprints) is stored alongside relationships: 10 + facets/<facet>/entities/<id>/observations.jsonl 11 + facets/<facet>/entities/<id>/voiceprints.npz 12 + """ 13 + 14 + import json 15 + import shutil 16 + from pathlib import Path 17 + 18 + from think.entities.core import EntityDict, atomic_write, entity_slug 19 + from think.utils import get_journal 20 + 21 + 22 + def facet_relationship_path(facet: str, entity_id: str) -> Path: 23 + """Return path to facet relationship file. 24 + 25 + Args: 26 + facet: Facet name 27 + entity_id: Entity ID (slug) 28 + 29 + Returns: 30 + Path to facets/<facet>/entities/<id>/entity.json 31 + """ 32 + return ( 33 + Path(get_journal()) / "facets" / facet / "entities" / entity_id / "entity.json" 34 + ) 35 + 36 + 37 + def load_facet_relationship(facet: str, entity_id: str) -> EntityDict | None: 38 + """Load a facet relationship for an entity. 39 + 40 + Args: 41 + facet: Facet name 42 + entity_id: Entity ID (slug) 43 + 44 + Returns: 45 + Relationship dict with entity_id, description, timestamps, etc., 46 + or None if not found. 47 + """ 48 + path = facet_relationship_path(facet, entity_id) 49 + if not path.exists(): 50 + return None 51 + 52 + try: 53 + with open(path, "r", encoding="utf-8") as f: 54 + data = json.load(f) 55 + # Ensure entity_id is present 56 + data["entity_id"] = entity_id 57 + return data 58 + except (json.JSONDecodeError, OSError): 59 + return None 60 + 61 + 62 + def save_facet_relationship( 63 + facet: str, entity_id: str, relationship: EntityDict 64 + ) -> None: 65 + """Save a facet relationship using atomic write. 66 + 67 + Creates the directory if needed. 
68 + 69 + Args: 70 + facet: Facet name 71 + entity_id: Entity ID (slug) 72 + relationship: Relationship dict with description, timestamps, etc. 73 + """ 74 + path = facet_relationship_path(facet, entity_id) 75 + 76 + # Ensure entity_id is in the relationship 77 + relationship["entity_id"] = entity_id 78 + 79 + content = json.dumps(relationship, ensure_ascii=False, indent=2) + "\n" 80 + atomic_write(path, content, prefix=".relationship_") 81 + 82 + 83 + def scan_facet_relationships(facet: str) -> list[str]: 84 + """List all entity IDs with relationships in a facet. 85 + 86 + Scans facets/<facet>/entities/ for subdirectories containing entity.json. 87 + 88 + Args: 89 + facet: Facet name 90 + 91 + Returns: 92 + List of entity IDs (directory names) 93 + """ 94 + entities_dir = Path(get_journal()) / "facets" / facet / "entities" 95 + if not entities_dir.exists(): 96 + return [] 97 + 98 + entity_ids = [] 99 + for entry in entities_dir.iterdir(): 100 + if entry.is_dir() and (entry / "entity.json").exists(): 101 + entity_ids.append(entry.name) 102 + 103 + return sorted(entity_ids) 104 + 105 + 106 + def enrich_relationship_with_journal( 107 + relationship: EntityDict, 108 + journal_entity: EntityDict | None, 109 + ) -> EntityDict: 110 + """Merge journal entity fields into relationship for unified view. 111 + 112 + Creates a combined entity dict that has identity fields (name, type, aka, 113 + is_principal) from journal and relationship fields (description, timestamps, 114 + etc.) from facet. 
115 + 116 + Args: 117 + relationship: Facet relationship dict 118 + journal_entity: Journal-level entity dict (or None) 119 + 120 + Returns: 121 + Merged entity dict with all fields 122 + """ 123 + # Start with relationship data 124 + result = dict(relationship) 125 + 126 + # Add identity fields from journal entity 127 + if journal_entity: 128 + result["id"] = journal_entity.get("id", relationship.get("entity_id", "")) 129 + result["name"] = journal_entity.get("name", "") 130 + result["type"] = journal_entity.get("type", "") 131 + if journal_entity.get("aka"): 132 + result["aka"] = journal_entity["aka"] 133 + if journal_entity.get("is_principal"): 134 + result["is_principal"] = True 135 + else: 136 + # No journal entity - use entity_id as id 137 + result["id"] = relationship.get("entity_id", "") 138 + 139 + # Remove entity_id from result (use id instead) 140 + result.pop("entity_id", None) 141 + 142 + return result 143 + 144 + 145 + def entity_memory_path(facet: str, name: str) -> Path: 146 + """Return path to entity's memory folder. 147 + 148 + Entity memory folders store persistent data about attached entities: 149 + observations (durable facts), voiceprints (voice recognition), etc. 150 + 151 + Args: 152 + facet: Facet name (e.g., "personal", "work") 153 + name: Entity name (will be slugified) 154 + 155 + Returns: 156 + Path to facets/{facet}/entities/{entity_slug}/ 157 + 158 + Raises: 159 + ValueError: If name slugifies to empty string 160 + """ 161 + slug = entity_slug(name) 162 + if not slug: 163 + raise ValueError(f"Entity name '{name}' slugifies to empty string") 164 + 165 + return Path(get_journal()) / "facets" / facet / "entities" / slug 166 + 167 + 168 + def ensure_entity_memory(facet: str, name: str) -> Path: 169 + """Create entity memory folder if needed, return path. 
170 + 171 + Args: 172 + facet: Facet name (e.g., "personal", "work") 173 + name: Entity name (will be slugified) 174 + 175 + Returns: 176 + Path to the created/existing folder 177 + 178 + Raises: 179 + ValueError: If name slugifies to empty string 180 + """ 181 + folder = entity_memory_path(facet, name) 182 + folder.mkdir(parents=True, exist_ok=True) 183 + return folder 184 + 185 + 186 + def rename_entity_memory(facet: str, old_name: str, new_name: str) -> bool: 187 + """Rename entity memory folder if it exists. 188 + 189 + Called when an entity is renamed to keep folder in sync. 190 + 191 + Args: 192 + facet: Facet name 193 + old_name: Previous entity name 194 + new_name: New entity name 195 + 196 + Returns: 197 + True if folder was renamed, False if old folder didn't exist 198 + or names slugify to the same value 199 + 200 + Raises: 201 + ValueError: If either name slugifies to empty string 202 + OSError: If rename fails (e.g., target exists) 203 + """ 204 + old_folder = entity_memory_path(facet, old_name) 205 + new_folder = entity_memory_path(facet, new_name) 206 + 207 + # No rename needed if slugified names are the same 208 + if old_folder == new_folder: 209 + return False 210 + 211 + if not old_folder.exists(): 212 + return False 213 + 214 + if new_folder.exists(): 215 + raise OSError(f"Target folder already exists: {new_folder}") 216 + 217 + shutil.move(str(old_folder), str(new_folder)) 218 + return True
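`save_facet_relationship` delegates durability to the `atomic_write` utility extracted into `core.py`, which is not shown in this diff. The usual shape of that pattern is to write a temp file in the destination directory and `os.replace()` it into place; a sketch under that assumption, not the actual `core.py` code:

```python
import os
import tempfile
from pathlib import Path

def atomic_write(path: Path, content: str, prefix: str = ".tmp_") -> None:
    """Write content to path so readers never observe a partial file."""
    path.parent.mkdir(parents=True, exist_ok=True)
    # Temp file must be on the same filesystem for os.replace() to be atomic
    fd, tmp = tempfile.mkstemp(prefix=prefix, dir=path.parent)
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(content)
        os.replace(tmp, path)
    except BaseException:
        if os.path.exists(tmp):
            os.unlink(tmp)
        raise
```

Extracting this once (per the commit message) removes the 4x duplication across the saving, relationship, and observation writers.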
+205
think/entities/saving.py
··· 1 + # SPDX-License-Identifier: AGPL-3.0-only 2 + # Copyright (c) 2026 sol pbc 3 + 4 + """Entity saving functions. 5 + 6 + This module handles saving entities to storage: 7 + - save_entities: Save attached or detected entities for a facet 8 + - update_entity_description: Update a single entity's description with guard 9 + """ 10 + 11 + import json 12 + import time 13 + 14 + from think.entities.core import EntityDict, atomic_write, entity_slug 15 + from think.entities.journal import get_or_create_journal_entity, save_journal_entity 16 + from think.entities.loading import detected_entities_path, load_entities 17 + from think.entities.relationships import save_facet_relationship 18 + 19 + 20 + def _save_entities_detected( 21 + facet: str, entities: list[EntityDict], day: str 22 + ) -> None: 23 + """Save detected entities to day-specific JSONL file.""" 24 + path = detected_entities_path(facet, day) 25 + 26 + # Ensure id field is present 27 + for entity in entities: 28 + name = entity.get("name", "") 29 + expected_id = entity_slug(name) 30 + if entity.get("id") != expected_id: 31 + entity["id"] = expected_id 32 + 33 + # Sort by type, then name for consistency 34 + sorted_entities = sorted( 35 + entities, key=lambda e: (e.get("type", ""), e.get("name", "")) 36 + ) 37 + 38 + # Format as JSONL and write atomically 39 + content = "".join(json.dumps(e, ensure_ascii=False) + "\n" for e in sorted_entities) 40 + atomic_write(path, content, prefix=".entities_") 41 + 42 + 43 + def _save_entities_attached(facet: str, entities: list[EntityDict]) -> None: 44 + """Save attached entities to new structure (journal entities + facet relationships).""" 45 + # Validate uniqueness 46 + seen_names: set[str] = set() 47 + seen_ids: set[str] = set() 48 + 49 + for entity in entities: 50 + name = entity.get("name", "") 51 + expected_id = entity_slug(name) 52 + 53 + # Set or update id 54 + if entity.get("id") != expected_id: 55 + entity["id"] = expected_id 56 + 57 + name_lower = name.lower() 58 
+ if name_lower in seen_names: 59 + raise ValueError(f"Duplicate entity name '{name}' in facet '{facet}'") 60 + seen_names.add(name_lower) 61 + 62 + if expected_id in seen_ids: 63 + raise ValueError( 64 + f"Duplicate entity id '{expected_id}' in facet '{facet}' " 65 + f"(names may slugify to same value)" 66 + ) 67 + seen_ids.add(expected_id) 68 + 69 + # Fields that belong to journal entity (identity) 70 + journal_fields = {"id", "name", "type", "aka", "is_principal", "created_at"} 71 + 72 + # Process each entity 73 + for entity in entities: 74 + entity_id = entity["id"] 75 + name = entity.get("name", "") 76 + entity_type = entity.get("type", "") 77 + aka = entity.get("aka") 78 + is_detached = entity.get("detached", False) 79 + 80 + # Ensure journal entity exists (creates if needed, preserves if exists) 81 + # Skip principal flagging for detached entities 82 + journal_entity = get_or_create_journal_entity( 83 + entity_id=entity_id, 84 + name=name, 85 + entity_type=entity_type, 86 + aka=aka if isinstance(aka, list) else None, 87 + skip_principal=is_detached, 88 + ) 89 + 90 + # Update journal entity if name/type/aka changed 91 + journal_updated = False 92 + if journal_entity.get("name") != name: 93 + journal_entity["name"] = name 94 + journal_updated = True 95 + if journal_entity.get("type") != entity_type: 96 + journal_entity["type"] = entity_type 97 + journal_updated = True 98 + if aka and isinstance(aka, list): 99 + # Merge aka lists (union) 100 + existing_aka = set(journal_entity.get("aka", [])) 101 + new_aka = existing_aka | set(aka) 102 + if new_aka != existing_aka: 103 + journal_entity["aka"] = sorted(new_aka) 104 + journal_updated = True 105 + # Only propagate is_principal if explicitly set and entity not detached 106 + if ( 107 + entity.get("is_principal") 108 + and not is_detached 109 + and not journal_entity.get("is_principal") 110 + ): 111 + journal_entity["is_principal"] = True 112 + journal_updated = True 113 + 114 + if journal_updated: 115 + 
save_journal_entity(journal_entity) 116 + 117 + # Build relationship record (all non-identity fields) 118 + relationship: EntityDict = { 119 + "entity_id": entity_id, 120 + } 121 + for key, value in entity.items(): 122 + if key not in journal_fields: 123 + relationship[key] = value 124 + 125 + # Save facet relationship 126 + save_facet_relationship(facet, entity_id, relationship) 127 + 128 + 129 + def save_entities( 130 + facet: str, entities: list[EntityDict], day: str | None = None 131 + ) -> None: 132 + """Save entities to storage. 133 + 134 + For detected entities (day provided), writes to day-specific JSONL files. 135 + For attached entities (day=None), writes to: 136 + - Journal-level entity files: entities/<id>/entity.json (identity) 137 + - Facet relationship files: facets/<facet>/entities/<id>/entity.json 138 + 139 + Ensures all entities have an `id` field (generates from name if missing). 140 + For attached entities, validates name uniqueness within the facet. 141 + 142 + Args: 143 + facet: Facet name 144 + entities: List of entity dictionaries (must have type, name, description keys; 145 + attached entities may also have id, attached_at, updated_at timestamps) 146 + day: Optional day in YYYYMMDD format for detected entities 147 + 148 + Raises: 149 + ValueError: If duplicate names found in attached entities (day=None) 150 + """ 151 + if day is not None: 152 + _save_entities_detected(facet, entities, day) 153 + else: 154 + _save_entities_attached(facet, entities) 155 + 156 + 157 + def update_entity_description( 158 + facet: str, 159 + name: str, 160 + old_description: str, 161 + new_description: str, 162 + day: str | None = None, 163 + ) -> EntityDict: 164 + """Update an entity's description after validating current state. 165 + 166 + Sets updated_at timestamp to current time on successful update. 
167 + 168 + Args: 169 + facet: Facet name 170 + name: Entity name to match (unique within facet) 171 + old_description: Current description (guard - must match) 172 + new_description: New description to set 173 + day: Optional day for detected entities 174 + 175 + Returns: 176 + The updated entity dict 177 + 178 + Raises: 179 + ValueError: If entity not found or guard mismatch 180 + """ 181 + # Load ALL entities including detached to avoid data loss on save 182 + # For attached entities (day=None), we need include_detached=True 183 + entities = ( 184 + load_entities(facet, day, include_detached=True) 185 + if day is None 186 + else load_entities(facet, day) 187 + ) 188 + 189 + for entity in entities: 190 + # Skip detached entities when searching 191 + if entity.get("detached"): 192 + continue 193 + if entity.get("name") == name: 194 + current_desc = entity.get("description", "") 195 + if current_desc != old_description: 196 + raise ValueError( 197 + f"Description mismatch for '{name}': expected '{old_description}', " 198 + f"found '{current_desc}'" 199 + ) 200 + entity["description"] = new_description 201 + entity["updated_at"] = int(time.time() * 1000) 202 + save_entities(facet, entities, day) 203 + return entity 204 + 205 + raise ValueError(f"Entity '{name}' not found in facet '{facet}'")
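`_save_entities_attached` partitions each entity dict along the `journal_fields` boundary: identity fields go to the journal-level record, everything else becomes the facet relationship. A small sketch of that split (hypothetical helper; the real code also merges into existing journal entities and validates uniqueness first):

```python
# Identity fields live on the journal entity; the rest is the facet relationship.
JOURNAL_FIELDS = {"id", "name", "type", "aka", "is_principal", "created_at"}

def split_entity(entity: dict) -> tuple[dict, dict]:
    journal = {k: v for k, v in entity.items() if k in JOURNAL_FIELDS}
    relationship = {"entity_id": entity["id"]}
    relationship.update(
        {k: v for k, v in entity.items() if k not in JOURNAL_FIELDS}
    )
    return journal, relationship
```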
+1 -8
think/formatters.py
···   118 118   # JSONL formatters
      119 119   "agents/*.jsonl": ("muse.cortex", "format_agent"),
      120 120   "config/actions/*.jsonl": ("think.facets", "format_logs"),
      121     - "facets/*/entities/*.jsonl": ("think.entities", "format_entities"),
      122     - "facets/*/entities.jsonl": ("think.entities", "format_entities"),
          121 + "facets/*/entities/*.jsonl": ("think.entities.formatting", "format_entities"),
      123 122   "facets/*/events/*.jsonl": ("think.events", "format_events"),
      124 123   "facets/*/todos/*.jsonl": ("apps.todos.todo", "format_todos"),
      125 124   "facets/*/logs/*.jsonl": ("think.facets", "format_logs"),
···   271 270   for jsonl_file in events_dir.glob("*.jsonl"):
      272 271       rel = f"facets/{facet_name}/events/{jsonl_file.name}"
      273 272       files[rel] = str(jsonl_file)
      274     -
      275     - # Entities attached: facets/*/entities.jsonl
      276     - entities_file = facet_dir / "entities.jsonl"
      277     - if entities_file.is_file():
      278     -     rel = f"facets/{facet_name}/entities.jsonl"
      279     -     files[rel] = str(entities_file)
      280 273
      281 274   # Entities detected: facets/*/entities/*.jsonl
      282 275   entities_dir = facet_dir / "entities"