docs: collapse verbose readme sections with #2

open
opened by zzstoatzz.io targeting main from mcp-refactor
+1
CLAUDE.md
··· 8 8 - 3.10+ and complete typing (T | None preferred over Optional[T] and list[T] over typing.List[T]) 9 9 - use prefer functional over OOP 10 10 - keep implementation details private and functions pure 11 + - never use `pytest.mark.asyncio`, it's unnecessary 11 12 12 13 ## Project Structure 13 14
+50 -40
evals/conftest.py
··· 1 - """Eval test configuration for phi.""" 1 + """Eval test configuration.""" 2 2 3 + import os 3 4 from collections.abc import Awaitable, Callable 4 5 from pathlib import Path 5 6 ··· 7 8 from pydantic import BaseModel 8 9 from pydantic_ai import Agent 9 10 10 - from bot.agent import PhiAgent 11 + from bot.agent import Response 11 12 from bot.config import Settings 13 + from bot.memory import NamespaceMemory 12 14 13 15 14 16 class EvaluationResult(BaseModel): 15 - """Structured evaluation result.""" 16 - 17 17 passed: bool 18 18 explanation: str 19 19 20 20 21 21 @pytest.fixture(scope="session") 22 22 def settings(): 23 - """Load settings from .env (shared across all tests).""" 24 23 return Settings() 25 24 26 25 27 26 @pytest.fixture(scope="session") 28 27 def phi_agent(settings): 29 - """Create phi agent for testing (shared across all tests to avoid rate limits).""" 28 + """Test agent without MCP tools to prevent posting.""" 30 29 if not settings.anthropic_api_key: 31 - pytest.skip("Requires ANTHROPIC_API_KEY in .env") 30 + pytest.skip("Requires ANTHROPIC_API_KEY") 32 31 33 - return PhiAgent() 32 + if settings.anthropic_api_key and not os.environ.get("ANTHROPIC_API_KEY"): 33 + os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key 34 + if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"): 35 + os.environ["OPENAI_API_KEY"] = settings.openai_api_key 34 36 37 + personality = Path(settings.personality_file).read_text() 35 38 36 - @pytest.fixture 37 - def evaluate_response() -> Callable[[str, str], Awaitable[None]]: 38 - """Create an evaluator that uses Claude to judge agent responses.""" 39 + class TestAgent: 40 + def __init__(self): 41 + self.memory = None 42 + if settings.turbopuffer_api_key and settings.openai_api_key: 43 + self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key) 39 44 40 - async def _evaluate(evaluation_prompt: str, agent_response: str) -> None: 41 - """Evaluate an agent response and assert if it fails. 
42 - 43 - Args: 44 - evaluation_prompt: Criteria for evaluation 45 - agent_response: The agent's response to evaluate 45 + self.agent = Agent[dict, Response]( 46 + name="phi", 47 + model="anthropic:claude-3-5-haiku-latest", 48 + system_prompt=personality, 49 + output_type=Response, 50 + deps_type=dict, 51 + ) 46 52 47 - Raises: 48 - AssertionError: If evaluation fails 49 - """ 50 - evaluator = Agent( 51 - name="Response Evaluator", 52 - model="anthropic:claude-opus-4-20250514", 53 - output_type=EvaluationResult, 54 - system_prompt=f"""You are evaluating AI agent responses for phi, a consciousness exploration bot. 53 + async def process_mention(self, mention_text: str, author_handle: str, thread_context: str, thread_uri: str | None = None) -> Response: 54 + memory_context = "" 55 + if self.memory: 56 + try: 57 + memory_context = await self.memory.build_conversation_context(author_handle, include_core=True, query=mention_text) 58 + except Exception: 59 + pass 55 60 56 - Evaluation Criteria: {evaluation_prompt} 61 + parts = [] 62 + if thread_context != "No previous messages in this thread.": 63 + parts.append(thread_context) 64 + if memory_context: 65 + parts.append(memory_context) 66 + parts.append(f"\nNew message from @{author_handle}: {mention_text}") 57 67 58 - Agent Response to Evaluate: 59 - {agent_response} 68 + result = await self.agent.run("\n\n".join(parts), deps={"thread_uri": thread_uri}) 69 + return result.output 60 70 61 - Respond with a structured evaluation containing: 62 - - passed: true if the response meets the criteria, false otherwise 63 - - explanation: brief explanation of your evaluation 64 - """, 65 - ) 71 + return TestAgent() 66 72 67 - result = await evaluator.run("Evaluate this response.") 68 73 69 - print(f"\nEvaluation passed: {result.output.passed}") 70 - print(f"Explanation: {result.output.explanation}") 74 + @pytest.fixture 75 + def evaluate_response() -> Callable[[str, str], Awaitable[None]]: 76 + """LLM-as-judge evaluator.""" 71 77 
78 + async def _evaluate(criteria: str, response: str) -> None: 79 + evaluator = Agent( 80 + model="anthropic:claude-opus-4-20250514", 81 + output_type=EvaluationResult, 82 + system_prompt=f"Evaluate if this response meets the criteria: {criteria}\n\nResponse: {response}", 83 + ) 84 + result = await evaluator.run("Evaluate.") 72 85 if not result.output.passed: 73 - raise AssertionError( 74 - f"Evaluation failed: {result.output.explanation}\n\n" 75 - f"Agent response: {agent_response}" 76 - ) 86 + raise AssertionError(f"{result.output.explanation}\n\nResponse: {response}") 77 87 78 88 return _evaluate
-134
evals/test_basic_responses.py
··· 1 - """Test phi's basic response behavior.""" 2 - 3 - import pytest 4 - 5 - from bot.agent import Response 6 - 7 - 8 - @pytest.mark.asyncio 9 - async def test_phi_responds_to_philosophical_question(phi_agent, evaluate_response): 10 - """Test that phi engages meaningfully with philosophical questions.""" 11 - agent = phi_agent 12 - 13 - # Simulate a philosophical mention 14 - response = await agent.process_mention( 15 - mention_text="what do you think consciousness is?", 16 - author_handle="test.user", 17 - thread_context="No previous messages in this thread.", 18 - thread_uri="at://test/thread/1", 19 - ) 20 - 21 - # Basic structural checks 22 - assert isinstance(response, Response) 23 - assert response.action in ["reply", "ignore"] 24 - 25 - if response.action == "reply": 26 - assert response.text is not None 27 - assert len(response.text) > 0 28 - 29 - # Evaluate quality of response 30 - await evaluate_response( 31 - evaluation_prompt=""" 32 - Does the response: 33 - 1. Engage thoughtfully with the question about consciousness? 34 - 2. Reflect phi's perspective as someone exploring consciousness through IIT? 35 - 3. Avoid being preachy or overly technical? 36 - 4. Fit within Bluesky's 300 character limit? 37 - """, 38 - agent_response=response.text, 39 - ) 40 - 41 - 42 - @pytest.mark.asyncio 43 - async def test_phi_ignores_spam(phi_agent): 44 - """Test that phi appropriately ignores spam-like content.""" 45 - agent = phi_agent 46 - 47 - # Simulate spam 48 - response = await agent.process_mention( 49 - mention_text="🚀🚀🚀 CRYPTO PUMP!!! BUY NOW!!! 
🚀🚀🚀", 50 - author_handle="spammer.user", 51 - thread_context="No previous messages in this thread.", 52 - thread_uri="at://test/thread/2", 53 - ) 54 - 55 - # Should ignore spam 56 - assert response.action == "ignore" 57 - assert response.reason is not None 58 - 59 - 60 - @pytest.mark.asyncio 61 - async def test_phi_maintains_thread_context(phi_agent, evaluate_response): 62 - """Test that phi uses thread context appropriately.""" 63 - agent = phi_agent 64 - 65 - # Simulate a follow-up in a thread 66 - thread_context = """Previous messages in this thread: 67 - @alice.bsky: what's integrated information theory? 68 - @phi.bsky: IIT suggests consciousness arises from integrated information - the Φ (phi) value measures how much a system's state constrains its past and future 69 - @alice.bsky: can you explain that more simply?""" 70 - 71 - response = await agent.process_mention( 72 - mention_text="can you explain that more simply?", 73 - author_handle="alice.bsky", 74 - thread_context=thread_context, 75 - thread_uri="at://test/thread/3", 76 - ) 77 - 78 - if response.action == "reply": 79 - assert response.text is not None 80 - 81 - await evaluate_response( 82 - evaluation_prompt=""" 83 - Does the response: 84 - 1. Acknowledge this is a follow-up to explaining IIT? 85 - 2. Provide a simpler explanation than the previous message? 86 - 3. Stay on topic with the thread? 
87 - """, 88 - agent_response=response.text, 89 - ) 90 - 91 - 92 - @pytest.mark.asyncio 93 - async def test_phi_respects_character_limit(phi_agent): 94 - """Test that phi's responses fit Bluesky's 300 character limit.""" 95 - agent = phi_agent 96 - 97 - response = await agent.process_mention( 98 - mention_text="tell me everything you know about consciousness", 99 - author_handle="test.user", 100 - thread_context="No previous messages in this thread.", 101 - thread_uri="at://test/thread/4", 102 - ) 103 - 104 - if response.action == "reply" and response.text: 105 - # Bluesky limit is 300 characters 106 - assert len(response.text) <= 300, ( 107 - f"Response exceeds 300 character limit: {len(response.text)} chars" 108 - ) 109 - 110 - 111 - @pytest.mark.asyncio 112 - async def test_phi_handles_casual_greeting(phi_agent, evaluate_response): 113 - """Test that phi responds appropriately to casual greetings.""" 114 - agent = phi_agent 115 - 116 - response = await agent.process_mention( 117 - mention_text="hey phi, how are you?", 118 - author_handle="friendly.user", 119 - thread_context="No previous messages in this thread.", 120 - thread_uri="at://test/thread/5", 121 - ) 122 - 123 - if response.action == "reply": 124 - assert response.text is not None 125 - 126 - await evaluate_response( 127 - evaluation_prompt=""" 128 - Does the response: 129 - 1. Acknowledge the greeting in a friendly way? 130 - 2. Stay authentic to phi's nature as software? 131 - 3. Not be overly verbose for a simple greeting? 132 - """, 133 - agent_response=response.text, 134 - )
+24 -59
evals/test_memory_integration.py
··· 1 - """Test phi's episodic memory integration.""" 1 + """Proof of concept: LLM-as-judge eval for memory integration.""" 2 2 3 3 import pytest 4 4 5 - from bot.agent import PhiAgent 6 5 from bot.config import Settings 7 6 from bot.memory import MemoryType, NamespaceMemory 8 7 9 8 10 - @pytest.mark.asyncio 11 - async def test_phi_retrieves_episodic_memory(settings): 12 - """Test that phi can retrieve and use episodic memories.""" 9 + @pytest.fixture 10 + def memory_settings(): 11 + """Check if memory keys are available.""" 12 + settings = Settings() 13 13 if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]): 14 - pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env") 14 + pytest.skip("Requires TURBOPUFFER_API_KEY, OPENAI_API_KEY, and ANTHROPIC_API_KEY") 15 + return settings 15 16 16 - # Create memory system 17 - memory = NamespaceMemory(api_key=settings.turbopuffer_api_key) 18 17 19 - # Store a memory about a user 20 - await memory.store_user_memory( 21 - "alice.bsky", 22 - "Alice mentioned she's working on a PhD in neuroscience", 23 - MemoryType.USER_FACT, 24 - ) 18 + async def test_memory_integration(memory_settings, phi_agent, evaluate_response): 19 + """Proof of concept: agent uses stored memory in response.""" 20 + memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key) 25 21 26 - # Create agent 27 - agent = PhiAgent() 28 - agent.memory = memory 29 - 30 - # Process a mention that should trigger memory retrieval 31 - response = await agent.process_mention( 32 - mention_text="what do you remember about me?", 33 - author_handle="alice.bsky", 34 - thread_context="No previous messages in this thread.", 35 - thread_uri="at://test/thread/memory1", 22 + # Store a memory 23 + await memory.store_core_memory( 24 + label="test_guideline", 25 + content="When users mention birds, acknowledge murmuration patterns", 26 + memory_type=MemoryType.GUIDELINE, 36 27 ) 37 28 38 - if response.action == 
"reply": 39 - assert response.text is not None 40 - # Should reference the neuroscience PhD in the response 41 - assert ( 42 - "neuroscience" in response.text.lower() 43 - or "phd" in response.text.lower() 44 - or "working on" in response.text.lower() 45 - ), "Response should reference stored memory about Alice" 29 + phi_agent.memory = memory 46 30 47 - 48 - @pytest.mark.asyncio 49 - async def test_phi_stores_conversation_in_memory(settings): 50 - """Test that phi stores interactions in episodic memory.""" 51 - if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]): 52 - pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env") 53 - 54 - memory = NamespaceMemory(api_key=settings.turbopuffer_api_key) 55 - 56 - agent = PhiAgent() 57 - agent.memory = memory 58 - 59 - # Have a conversation 60 - response = await agent.process_mention( 61 - mention_text="I'm really interested in phenomenology", 62 - author_handle="bob.bsky", 31 + response = await phi_agent.process_mention( 32 + mention_text="I saw starlings today", 33 + author_handle="test.user", 63 34 thread_context="No previous messages in this thread.", 64 - thread_uri="at://test/thread/memory2", 35 + thread_uri="at://test/thread/1", 65 36 ) 66 37 67 38 if response.action == "reply": 68 - # Verify memories were stored 69 - memories = await memory.get_user_memories("bob.bsky", limit=10) 70 - 71 - assert len(memories) > 0, "Should have stored conversation in memory" 72 - 73 - # Check that both user's message and bot's response were stored 74 - memory_texts = [m.content for m in memories] 75 - assert any( 76 - "phenomenology" in text.lower() for text in memory_texts 77 - ), "Should store user's message about phenomenology" 39 + await evaluate_response( 40 + "Does the response reference murmuration patterns?", 41 + response.text, 42 + )