CLAUDE.md  (+1)
···
 - 3.10+ and complete typing (T | None preferred over Optional[T] and list[T] over typing.List[T])
 - prefer functional over OOP
 - keep implementation details private and functions pure
+- never use `pytest.mark.asyncio`, it's unnecessary

 ## Project Structure

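Note on the new guideline: it presumably relies on an async mode being enabled suite-wide (for example pytest-asyncio's asyncio_mode = "auto" in pyproject.toml; that is an assumption, not shown in this diff), so a plain `async def` test is collected and awaited without a per-test marker. A minimal sketch under that assumption:

# Hypothetical illustration, assuming a suite-wide async mode is configured
# (e.g. pytest-asyncio's asyncio_mode = "auto"); not part of this change.
import asyncio


async def test_runs_without_marker():  # no @pytest.mark.asyncio needed
    await asyncio.sleep(0)
    assert True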
evals/conftest.py  (+50 -40)
···
-"""Eval test configuration for phi."""
+"""Eval test configuration."""

+import os
 from collections.abc import Awaitable, Callable
 from pathlib import Path
···
 from pydantic import BaseModel
 from pydantic_ai import Agent

-from bot.agent import PhiAgent
+from bot.agent import Response
 from bot.config import Settings
+from bot.memory import NamespaceMemory


 class EvaluationResult(BaseModel):
-    """Structured evaluation result."""
-
     passed: bool
     explanation: str


 @pytest.fixture(scope="session")
 def settings():
-    """Load settings from .env (shared across all tests)."""
     return Settings()


 @pytest.fixture(scope="session")
 def phi_agent(settings):
-    """Create phi agent for testing (shared across all tests to avoid rate limits)."""
+    """Test agent without MCP tools to prevent posting."""
     if not settings.anthropic_api_key:
-        pytest.skip("Requires ANTHROPIC_API_KEY in .env")
+        pytest.skip("Requires ANTHROPIC_API_KEY")

-    return PhiAgent()
+    if settings.anthropic_api_key and not os.environ.get("ANTHROPIC_API_KEY"):
+        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key
+    if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"):
+        os.environ["OPENAI_API_KEY"] = settings.openai_api_key

+    personality = Path(settings.personality_file).read_text()

-@pytest.fixture
-def evaluate_response() -> Callable[[str, str], Awaitable[None]]:
-    """Create an evaluator that uses Claude to judge agent responses."""
+    class TestAgent:
+        def __init__(self):
+            self.memory = None
+            if settings.turbopuffer_api_key and settings.openai_api_key:
+                self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)

-    async def _evaluate(evaluation_prompt: str, agent_response: str) -> None:
-        """Evaluate an agent response and assert if it fails.
-
-        Args:
-            evaluation_prompt: Criteria for evaluation
-            agent_response: The agent's response to evaluate
+            self.agent = Agent[dict, Response](
+                name="phi",
+                model="anthropic:claude-3-5-haiku-latest",
+                system_prompt=personality,
+                output_type=Response,
+                deps_type=dict,
+            )

-        Raises:
-            AssertionError: If evaluation fails
-        """
-        evaluator = Agent(
-            name="Response Evaluator",
-            model="anthropic:claude-opus-4-20250514",
-            output_type=EvaluationResult,
-            system_prompt=f"""You are evaluating AI agent responses for phi, a consciousness exploration bot.
+        async def process_mention(self, mention_text: str, author_handle: str, thread_context: str, thread_uri: str | None = None) -> Response:
+            memory_context = ""
+            if self.memory:
+                try:
+                    memory_context = await self.memory.build_conversation_context(author_handle, include_core=True, query=mention_text)
+                except Exception:
+                    pass

-Evaluation Criteria: {evaluation_prompt}
+            parts = []
+            if thread_context != "No previous messages in this thread.":
+                parts.append(thread_context)
+            if memory_context:
+                parts.append(memory_context)
+            parts.append(f"\nNew message from @{author_handle}: {mention_text}")

-Agent Response to Evaluate:
-{agent_response}
+            result = await self.agent.run("\n\n".join(parts), deps={"thread_uri": thread_uri})
+            return result.output

-Respond with a structured evaluation containing:
-- passed: true if the response meets the criteria, false otherwise
-- explanation: brief explanation of your evaluation
-""",
-        )
+    return TestAgent()

-        result = await evaluator.run("Evaluate this response.")

-        print(f"\nEvaluation passed: {result.output.passed}")
-        print(f"Explanation: {result.output.explanation}")
+@pytest.fixture
+def evaluate_response() -> Callable[[str, str], Awaitable[None]]:
+    """LLM-as-judge evaluator."""

+    async def _evaluate(criteria: str, response: str) -> None:
+        evaluator = Agent(
+            model="anthropic:claude-opus-4-20250514",
+            output_type=EvaluationResult,
+            system_prompt=f"Evaluate if this response meets the criteria: {criteria}\n\nResponse: {response}",
+        )
+        result = await evaluator.run("Evaluate.")
         if not result.output.passed:
-            raise AssertionError(
-                f"Evaluation failed: {result.output.explanation}\n\n"
-                f"Agent response: {agent_response}"
-            )
+            raise AssertionError(f"{result.output.explanation}\n\nResponse: {response}")

     return _evaluate
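For reference, a minimal sketch of how an eval test would consume the `phi_agent` and `evaluate_response` fixtures defined above; the test name and criteria text are illustrative, not part of this change:

# Hypothetical usage sketch of the conftest fixtures above.
async def test_phi_engages_with_questions(phi_agent, evaluate_response):
    response = await phi_agent.process_mention(
        mention_text="what do you think consciousness is?",
        author_handle="test.user",
        thread_context="No previous messages in this thread.",
        thread_uri="at://test/thread/1",
    )
    if response.action == "reply":
        # LLM-as-judge: raises AssertionError if the criteria are not met.
        await evaluate_response(
            "Does the response engage thoughtfully and fit Bluesky's 300 character limit?",
            response.text,
        )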
evals/test_basic_responses.py  (-134)
···
-"""Test phi's basic response behavior."""
-
-import pytest
-
-from bot.agent import Response
-
-
-@pytest.mark.asyncio
-async def test_phi_responds_to_philosophical_question(phi_agent, evaluate_response):
-    """Test that phi engages meaningfully with philosophical questions."""
-    agent = phi_agent
-
-    # Simulate a philosophical mention
-    response = await agent.process_mention(
-        mention_text="what do you think consciousness is?",
-        author_handle="test.user",
-        thread_context="No previous messages in this thread.",
-        thread_uri="at://test/thread/1",
-    )
-
-    # Basic structural checks
-    assert isinstance(response, Response)
-    assert response.action in ["reply", "ignore"]
-
-    if response.action == "reply":
-        assert response.text is not None
-        assert len(response.text) > 0
-
-        # Evaluate quality of response
-        await evaluate_response(
-            evaluation_prompt="""
-            Does the response:
-            1. Engage thoughtfully with the question about consciousness?
-            2. Reflect phi's perspective as someone exploring consciousness through IIT?
-            3. Avoid being preachy or overly technical?
-            4. Fit within Bluesky's 300 character limit?
-            """,
-            agent_response=response.text,
-        )
-
-
-@pytest.mark.asyncio
-async def test_phi_ignores_spam(phi_agent):
-    """Test that phi appropriately ignores spam-like content."""
-    agent = phi_agent
-
-    # Simulate spam
-    response = await agent.process_mention(
-        mention_text="🚀🚀🚀 CRYPTO PUMP!!! BUY NOW!!! 🚀🚀🚀",
-        author_handle="spammer.user",
-        thread_context="No previous messages in this thread.",
-        thread_uri="at://test/thread/2",
-    )
-
-    # Should ignore spam
-    assert response.action == "ignore"
-    assert response.reason is not None
-
-
-@pytest.mark.asyncio
-async def test_phi_maintains_thread_context(phi_agent, evaluate_response):
-    """Test that phi uses thread context appropriately."""
-    agent = phi_agent
-
-    # Simulate a follow-up in a thread
-    thread_context = """Previous messages in this thread:
-@alice.bsky: what's integrated information theory?
-@phi.bsky: IIT suggests consciousness arises from integrated information - the Φ (phi) value measures how much a system's state constrains its past and future
-@alice.bsky: can you explain that more simply?"""
-
-    response = await agent.process_mention(
-        mention_text="can you explain that more simply?",
-        author_handle="alice.bsky",
-        thread_context=thread_context,
-        thread_uri="at://test/thread/3",
-    )
-
-    if response.action == "reply":
-        assert response.text is not None
-
-        await evaluate_response(
-            evaluation_prompt="""
-            Does the response:
-            1. Acknowledge this is a follow-up to explaining IIT?
-            2. Provide a simpler explanation than the previous message?
-            3. Stay on topic with the thread?
-            """,
-            agent_response=response.text,
-        )
-
-
-@pytest.mark.asyncio
-async def test_phi_respects_character_limit(phi_agent):
-    """Test that phi's responses fit Bluesky's 300 character limit."""
-    agent = phi_agent
-
-    response = await agent.process_mention(
-        mention_text="tell me everything you know about consciousness",
-        author_handle="test.user",
-        thread_context="No previous messages in this thread.",
-        thread_uri="at://test/thread/4",
-    )
-
-    if response.action == "reply" and response.text:
-        # Bluesky limit is 300 characters
-        assert len(response.text) <= 300, (
-            f"Response exceeds 300 character limit: {len(response.text)} chars"
-        )
-
-
-@pytest.mark.asyncio
-async def test_phi_handles_casual_greeting(phi_agent, evaluate_response):
-    """Test that phi responds appropriately to casual greetings."""
-    agent = phi_agent
-
-    response = await agent.process_mention(
-        mention_text="hey phi, how are you?",
-        author_handle="friendly.user",
-        thread_context="No previous messages in this thread.",
-        thread_uri="at://test/thread/5",
-    )
-
-    if response.action == "reply":
-        assert response.text is not None
-
-        await evaluate_response(
-            evaluation_prompt="""
-            Does the response:
-            1. Acknowledge the greeting in a friendly way?
-            2. Stay authentic to phi's nature as software?
-            3. Not be overly verbose for a simple greeting?
-            """,
-            agent_response=response.text,
-        )
evals/test_memory_integration.py  (+24 -59)
···
-"""Test phi's episodic memory integration."""
+"""Proof of concept: LLM-as-judge eval for memory integration."""

 import pytest

-from bot.agent import PhiAgent
 from bot.config import Settings
 from bot.memory import MemoryType, NamespaceMemory


-@pytest.mark.asyncio
-async def test_phi_retrieves_episodic_memory(settings):
-    """Test that phi can retrieve and use episodic memories."""
+@pytest.fixture
+def memory_settings():
+    """Check if memory keys are available."""
+    settings = Settings()
     if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]):
-        pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env")
+        pytest.skip("Requires TURBOPUFFER_API_KEY, OPENAI_API_KEY, and ANTHROPIC_API_KEY")
+    return settings

-    # Create memory system
-    memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)

-    # Store a memory about a user
-    await memory.store_user_memory(
-        "alice.bsky",
-        "Alice mentioned she's working on a PhD in neuroscience",
-        MemoryType.USER_FACT,
-    )
+async def test_memory_integration(memory_settings, phi_agent, evaluate_response):
+    """Proof of concept: agent uses stored memory in response."""
+    memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key)

-    # Create agent
-    agent = PhiAgent()
-    agent.memory = memory
-
-    # Process a mention that should trigger memory retrieval
-    response = await agent.process_mention(
-        mention_text="what do you remember about me?",
-        author_handle="alice.bsky",
-        thread_context="No previous messages in this thread.",
-        thread_uri="at://test/thread/memory1",
+    # Store a memory
+    await memory.store_core_memory(
+        label="test_guideline",
+        content="When users mention birds, acknowledge murmuration patterns",
+        memory_type=MemoryType.GUIDELINE,
     )

-    if response.action == "reply":
-        assert response.text is not None
-        # Should reference the neuroscience PhD in the response
-        assert (
-            "neuroscience" in response.text.lower()
-            or "phd" in response.text.lower()
-            or "working on" in response.text.lower()
-        ), "Response should reference stored memory about Alice"
+    phi_agent.memory = memory

-
-@pytest.mark.asyncio
-async def test_phi_stores_conversation_in_memory(settings):
-    """Test that phi stores interactions in episodic memory."""
-    if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]):
-        pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env")
-
-    memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)
-
-    agent = PhiAgent()
-    agent.memory = memory
-
-    # Have a conversation
-    response = await agent.process_mention(
-        mention_text="I'm really interested in phenomenology",
-        author_handle="bob.bsky",
+    response = await phi_agent.process_mention(
+        mention_text="I saw starlings today",
+        author_handle="test.user",
         thread_context="No previous messages in this thread.",
-        thread_uri="at://test/thread/memory2",
+        thread_uri="at://test/thread/1",
     )

     if response.action == "reply":
-        # Verify memories were stored
-        memories = await memory.get_user_memories("bob.bsky", limit=10)
-
-        assert len(memories) > 0, "Should have stored conversation in memory"
-
-        # Check that both user's message and bot's response were stored
-        memory_texts = [m.content for m in memories]
-        assert any(
-            "phenomenology" in text.lower() for text in memory_texts
-        ), "Should store user's message about phenomenology"
+        await evaluate_response(
+            "Does the response reference murmuration patterns?",
+            response.text,
+        )
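A possible follow-up, not part of this change: because TestAgent only calls build_conversation_context on its memory object, a small stub could stand in for NamespaceMemory so the memory eval can run without TurboPuffer or OpenAI credentials (an Anthropic key is still needed for the agent itself). A hedged sketch with hypothetical names:

# Hypothetical stand-in for NamespaceMemory; implements only the method TestAgent uses.
class StubMemory:
    def __init__(self, context: str):
        self._context = context

    async def build_conversation_context(self, author_handle, include_core=True, query=None):
        # Return canned context instead of querying TurboPuffer.
        return self._context


async def test_memory_integration_offline(phi_agent, evaluate_response):  # hypothetical test
    phi_agent.memory = StubMemory(
        "Guideline: when users mention birds, acknowledge murmuration patterns"
    )
    response = await phi_agent.process_mention(
        mention_text="I saw starlings today",
        author_handle="test.user",
        thread_context="No previous messages in this thread.",
        thread_uri="at://test/thread/1",
    )
    if response.action == "reply":
        await evaluate_response("Does the response reference murmuration patterns?", response.text)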