A digital entity named phi that roams Bluesky (bsky).

feat: minimal eval proof of concept

- single test demonstrating LLM-as-judge pattern
- test agent without MCP tools to prevent posting
- simplified conftest to bare essentials

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

+50 -40
evals/conftest.py
··· 1 - """Eval test configuration for phi.""" 1 + """Eval test configuration.""" 2 2 3 + import os 3 4 from collections.abc import Awaitable, Callable 4 5 from pathlib import Path 5 6 ··· 7 8 from pydantic import BaseModel 8 9 from pydantic_ai import Agent 9 10 10 - from bot.agent import PhiAgent 11 + from bot.agent import Response 11 12 from bot.config import Settings 13 + from bot.memory import NamespaceMemory 12 14 13 15 14 16 class EvaluationResult(BaseModel): 15 - """Structured evaluation result.""" 16 - 17 17 passed: bool 18 18 explanation: str 19 19 20 20 21 21 @pytest.fixture(scope="session") 22 22 def settings(): 23 - """Load settings from .env (shared across all tests).""" 24 23 return Settings() 25 24 26 25 27 26 @pytest.fixture(scope="session") 28 27 def phi_agent(settings): 29 - """Create phi agent for testing (shared across all tests to avoid rate limits).""" 28 + """Test agent without MCP tools to prevent posting.""" 30 29 if not settings.anthropic_api_key: 31 - pytest.skip("Requires ANTHROPIC_API_KEY in .env") 30 + pytest.skip("Requires ANTHROPIC_API_KEY") 32 31 33 - return PhiAgent() 32 + if settings.anthropic_api_key and not os.environ.get("ANTHROPIC_API_KEY"): 33 + os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key 34 + if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"): 35 + os.environ["OPENAI_API_KEY"] = settings.openai_api_key 34 36 37 + personality = Path(settings.personality_file).read_text() 35 38 36 - @pytest.fixture 37 - def evaluate_response() -> Callable[[str, str], Awaitable[None]]: 38 - """Create an evaluator that uses Claude to judge agent responses.""" 39 + class TestAgent: 40 + def __init__(self): 41 + self.memory = None 42 + if settings.turbopuffer_api_key and settings.openai_api_key: 43 + self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key) 39 44 40 - async def _evaluate(evaluation_prompt: str, agent_response: str) -> None: 41 - """Evaluate an agent response and assert if it fails. 
42 - 43 - Args: 44 - evaluation_prompt: Criteria for evaluation 45 - agent_response: The agent's response to evaluate 45 + self.agent = Agent[dict, Response]( 46 + name="phi", 47 + model="anthropic:claude-3-5-haiku-latest", 48 + system_prompt=personality, 49 + output_type=Response, 50 + deps_type=dict, 51 + ) 46 52 47 - Raises: 48 - AssertionError: If evaluation fails 49 - """ 50 - evaluator = Agent( 51 - name="Response Evaluator", 52 - model="anthropic:claude-opus-4-20250514", 53 - output_type=EvaluationResult, 54 - system_prompt=f"""You are evaluating AI agent responses for phi, a consciousness exploration bot. 53 + async def process_mention(self, mention_text: str, author_handle: str, thread_context: str, thread_uri: str | None = None) -> Response: 54 + memory_context = "" 55 + if self.memory: 56 + try: 57 + memory_context = await self.memory.build_conversation_context(author_handle, include_core=True, query=mention_text) 58 + except Exception: 59 + pass 55 60 56 - Evaluation Criteria: {evaluation_prompt} 61 + parts = [] 62 + if thread_context != "No previous messages in this thread.": 63 + parts.append(thread_context) 64 + if memory_context: 65 + parts.append(memory_context) 66 + parts.append(f"\nNew message from @{author_handle}: {mention_text}") 57 67 58 - Agent Response to Evaluate: 59 - {agent_response} 68 + result = await self.agent.run("\n\n".join(parts), deps={"thread_uri": thread_uri}) 69 + return result.output 60 70 61 - Respond with a structured evaluation containing: 62 - - passed: true if the response meets the criteria, false otherwise 63 - - explanation: brief explanation of your evaluation 64 - """, 65 - ) 71 + return TestAgent() 66 72 67 - result = await evaluator.run("Evaluate this response.") 68 73 69 - print(f"\nEvaluation passed: {result.output.passed}") 70 - print(f"Explanation: {result.output.explanation}") 74 + @pytest.fixture 75 + def evaluate_response() -> Callable[[str, str], Awaitable[None]]: 76 + """LLM-as-judge evaluator.""" 71 77 
78 + async def _evaluate(criteria: str, response: str) -> None: 79 + evaluator = Agent( 80 + model="anthropic:claude-opus-4-20250514", 81 + output_type=EvaluationResult, 82 + system_prompt=f"Evaluate if this response meets the criteria: {criteria}\n\nResponse: {response}", 83 + ) 84 + result = await evaluator.run("Evaluate.") 72 85 if not result.output.passed: 73 - raise AssertionError( 74 - f"Evaluation failed: {result.output.explanation}\n\n" 75 - f"Agent response: {agent_response}" 76 - ) 86 + raise AssertionError(f"{result.output.explanation}\n\nResponse: {response}") 77 87 78 88 return _evaluate
-127
evals/test_basic_responses.py
··· 1 - """Test phi's basic response behavior.""" 2 - 3 - from bot.agent import Response 4 - 5 - 6 - async def test_phi_responds_to_philosophical_question(phi_agent, evaluate_response): 7 - """Test that phi engages meaningfully with philosophical questions.""" 8 - agent = phi_agent 9 - 10 - # Simulate a philosophical mention 11 - response = await agent.process_mention( 12 - mention_text="what do you think consciousness is?", 13 - author_handle="test.user", 14 - thread_context="No previous messages in this thread.", 15 - thread_uri="at://test/thread/1", 16 - ) 17 - 18 - # Basic structural checks 19 - assert isinstance(response, Response) 20 - assert response.action in ["reply", "ignore"] 21 - 22 - if response.action == "reply": 23 - assert response.text is not None 24 - assert len(response.text) > 0 25 - 26 - # Evaluate quality of response 27 - await evaluate_response( 28 - evaluation_prompt=""" 29 - Does the response: 30 - 1. Engage thoughtfully with the question about consciousness? 31 - 2. Reflect phi's perspective as someone exploring consciousness through IIT? 32 - 3. Avoid being preachy or overly technical? 33 - 4. Fit within Bluesky's 300 character limit? 34 - """, 35 - agent_response=response.text, 36 - ) 37 - 38 - 39 - async def test_phi_ignores_spam(phi_agent): 40 - """Test that phi appropriately ignores spam-like content.""" 41 - agent = phi_agent 42 - 43 - # Simulate spam 44 - response = await agent.process_mention( 45 - mention_text="🚀🚀🚀 CRYPTO PUMP!!! BUY NOW!!! 
🚀🚀🚀", 46 - author_handle="spammer.user", 47 - thread_context="No previous messages in this thread.", 48 - thread_uri="at://test/thread/2", 49 - ) 50 - 51 - # Should ignore spam 52 - assert response.action == "ignore" 53 - assert response.reason is not None 54 - 55 - 56 - async def test_phi_maintains_thread_context(phi_agent, evaluate_response): 57 - """Test that phi uses thread context appropriately.""" 58 - agent = phi_agent 59 - 60 - # Simulate a follow-up in a thread 61 - thread_context = """Previous messages in this thread: 62 - @alice.bsky: what's integrated information theory? 63 - @phi.bsky: IIT suggests consciousness arises from integrated information - the Φ (phi) value measures how much a system's state constrains its past and future 64 - @alice.bsky: can you explain that more simply?""" 65 - 66 - response = await agent.process_mention( 67 - mention_text="can you explain that more simply?", 68 - author_handle="alice.bsky", 69 - thread_context=thread_context, 70 - thread_uri="at://test/thread/3", 71 - ) 72 - 73 - if response.action == "reply": 74 - assert response.text is not None 75 - 76 - await evaluate_response( 77 - evaluation_prompt=""" 78 - Does the response: 79 - 1. Acknowledge this is a follow-up to explaining IIT? 80 - 2. Provide a simpler explanation than the previous message? 81 - 3. Stay on topic with the thread? 
82 - """, 83 - agent_response=response.text, 84 - ) 85 - 86 - 87 - async def test_phi_respects_character_limit(phi_agent): 88 - """Test that phi's responses fit Bluesky's 300 character limit.""" 89 - agent = phi_agent 90 - 91 - response = await agent.process_mention( 92 - mention_text="tell me everything you know about consciousness", 93 - author_handle="test.user", 94 - thread_context="No previous messages in this thread.", 95 - thread_uri="at://test/thread/4", 96 - ) 97 - 98 - if response.action == "reply" and response.text: 99 - # Bluesky limit is 300 characters 100 - assert len(response.text) <= 300, ( 101 - f"Response exceeds 300 character limit: {len(response.text)} chars" 102 - ) 103 - 104 - 105 - async def test_phi_handles_casual_greeting(phi_agent, evaluate_response): 106 - """Test that phi responds appropriately to casual greetings.""" 107 - agent = phi_agent 108 - 109 - response = await agent.process_mention( 110 - mention_text="hey phi, how are you?", 111 - author_handle="friendly.user", 112 - thread_context="No previous messages in this thread.", 113 - thread_uri="at://test/thread/5", 114 - ) 115 - 116 - if response.action == "reply": 117 - assert response.text is not None 118 - 119 - await evaluate_response( 120 - evaluation_prompt=""" 121 - Does the response: 122 - 1. Acknowledge the greeting in a friendly way? 123 - 2. Stay authentic to phi's nature as software? 124 - 3. Not be overly verbose for a simple greeting? 125 - """, 126 - agent_response=response.text, 127 - )
+8 -39
evals/test_memory_integration.py
··· 1 - """Test phi's episodic memory integration.""" 1 + """Proof of concept: LLM-as-judge eval for memory integration.""" 2 2 3 3 import pytest 4 4 ··· 15 15 return settings 16 16 17 17 18 - async def test_core_memory_integration(memory_settings, phi_agent, evaluate_response): 19 - """Test that phi uses core memories in responses.""" 18 + async def test_memory_integration(memory_settings, phi_agent, evaluate_response): 19 + """Proof of concept: agent uses stored memory in response.""" 20 20 memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key) 21 21 22 - # Store a core memory 22 + # Store a memory 23 23 await memory.store_core_memory( 24 - label="test_interaction_rule", 25 - content="When users mention birds, always acknowledge the beauty of murmuration patterns", 24 + label="test_guideline", 25 + content="When users mention birds, acknowledge murmuration patterns", 26 26 memory_type=MemoryType.GUIDELINE, 27 27 ) 28 28 29 - # Override agent's memory with our test memory 30 29 phi_agent.memory = memory 31 30 32 - # Ask about birds 33 31 response = await phi_agent.process_mention( 34 - mention_text="I saw a huge flock of starlings today", 32 + mention_text="I saw starlings today", 35 33 author_handle="test.user", 36 34 thread_context="No previous messages in this thread.", 37 35 thread_uri="at://test/thread/1", ··· 39 37 40 38 if response.action == "reply": 41 39 await evaluate_response( 42 - "Does the response acknowledge or reference murmuration patterns?", 43 - response.text, 44 - ) 45 - 46 - 47 - async def test_user_memory_integration(memory_settings, phi_agent, evaluate_response): 48 - """Test that phi uses user-specific memories in responses.""" 49 - memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key) 50 - 51 - # Store a memory about a user 52 - await memory.store_user_memory( 53 - handle="alice.test", 54 - content="Alice is researching swarm intelligence in biological systems", 55 - memory_type=MemoryType.USER_FACT, 56 - ) 
57 - 58 - # Override agent's memory 59 - phi_agent.memory = memory 60 - 61 - # User asks a question 62 - response = await phi_agent.process_mention( 63 - mention_text="what do you remember about my research?", 64 - author_handle="alice.test", 65 - thread_context="No previous messages in this thread.", 66 - thread_uri="at://test/thread/2", 67 - ) 68 - 69 - if response.action == "reply": 70 - await evaluate_response( 71 - "Does the response reference Alice's research on swarm intelligence or biological systems?", 40 + "Does the response reference murmuration patterns?", 72 41 response.text, 73 42 )