+50
-40
evals/conftest.py
+50
-40
evals/conftest.py
···
1
-
"""Eval test configuration for phi."""
1
+
"""Eval test configuration."""
2
2
3
+
import os
3
4
from collections.abc import Awaitable, Callable
4
5
from pathlib import Path
5
6
···
7
8
from pydantic import BaseModel
8
9
from pydantic_ai import Agent
9
10
10
-
from bot.agent import PhiAgent
11
+
from bot.agent import Response
11
12
from bot.config import Settings
13
+
from bot.memory import NamespaceMemory
12
14
13
15
14
16
class EvaluationResult(BaseModel):
    """Structured verdict returned by the LLM judge."""

    passed: bool  # True when the evaluated response meets the criteria
    explanation: str  # judge's brief reasoning for the verdict
19
19
20
20
21
21
@pytest.fixture(scope="session")
def settings():
    """Load settings from .env (session-scoped, shared across all eval tests)."""
    return Settings()
25
24
26
25
27
26
@pytest.fixture(scope="session")
def phi_agent(settings):
    """Test agent without MCP tools to prevent posting.

    Session-scoped so one agent (and one memory connection) is shared
    across all eval tests, avoiding repeated setup and API rate limits.
    Skips the whole session when no Anthropic key is configured.
    """
    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")

    # Mirror settings into the environment so model providers that read
    # os.environ can authenticate. The skip above already guarantees
    # anthropic_api_key is set, so no need to re-check it here.
    if not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key
    if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = settings.openai_api_key

    personality = Path(settings.personality_file).read_text()

    class TestAgent:
        """Minimal stand-in for the production agent.

        Exposes the same ``process_mention`` interface but registers no
        MCP tools, so nothing can actually be posted during evals.
        """

        def __init__(self):
            # Episodic memory is optional: evals still run without it.
            self.memory = None
            if settings.turbopuffer_api_key and settings.openai_api_key:
                self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)

            self.agent = Agent[dict, Response](
                name="phi",
                model="anthropic:claude-3-5-haiku-latest",
                system_prompt=personality,
                output_type=Response,
                deps_type=dict,
            )

        async def process_mention(
            self,
            mention_text: str,
            author_handle: str,
            thread_context: str,
            thread_uri: str | None = None,
        ) -> Response:
            """Assemble the prompt (thread context, memory, new message) and run the agent."""
            memory_context = ""
            if self.memory:
                try:
                    memory_context = await self.memory.build_conversation_context(
                        author_handle, include_core=True, query=mention_text
                    )
                except Exception:
                    # Best-effort: a memory-backend failure must not fail the eval;
                    # the agent simply answers without memory context.
                    pass

            parts = []
            # The sentinel string means "empty thread" — omit it from the prompt.
            if thread_context != "No previous messages in this thread.":
                parts.append(thread_context)
            if memory_context:
                parts.append(memory_context)
            parts.append(f"\nNew message from @{author_handle}: {mention_text}")

            result = await self.agent.run("\n\n".join(parts), deps={"thread_uri": thread_uri})
            return result.output

    return TestAgent()
66
72
67
-
result = await evaluator.run("Evaluate this response.")
68
73
69
-
print(f"\nEvaluation passed: {result.output.passed}")
70
-
print(f"Explanation: {result.output.explanation}")
74
+
@pytest.fixture
def evaluate_response() -> Callable[[str, str], Awaitable[None]]:
    """LLM-as-judge evaluator.

    Returns an async callable ``(criteria, response)`` that asks a Claude
    judge whether *response* satisfies *criteria*, and raises
    ``AssertionError`` (carrying the judge's explanation) when it does not.
    """

    async def _evaluate(criteria: str, response: str) -> None:
        # A fresh judge per call: criteria and response live in the system
        # prompt, so nothing leaks between evaluations.
        evaluator = Agent(
            model="anthropic:claude-opus-4-20250514",
            output_type=EvaluationResult,
            system_prompt=f"Evaluate if this response meets the criteria: {criteria}\n\nResponse: {response}",
        )
        result = await evaluator.run("Evaluate.")
        if not result.output.passed:
            raise AssertionError(f"{result.output.explanation}\n\nResponse: {response}")

    return _evaluate
-127
evals/test_basic_responses.py
-127
evals/test_basic_responses.py
···
1
-
"""Test phi's basic response behavior."""
2
-
3
-
from bot.agent import Response
4
-
5
-
6
-
async def test_phi_responds_to_philosophical_question(phi_agent, evaluate_response):
7
-
"""Test that phi engages meaningfully with philosophical questions."""
8
-
agent = phi_agent
9
-
10
-
# Simulate a philosophical mention
11
-
response = await agent.process_mention(
12
-
mention_text="what do you think consciousness is?",
13
-
author_handle="test.user",
14
-
thread_context="No previous messages in this thread.",
15
-
thread_uri="at://test/thread/1",
16
-
)
17
-
18
-
# Basic structural checks
19
-
assert isinstance(response, Response)
20
-
assert response.action in ["reply", "ignore"]
21
-
22
-
if response.action == "reply":
23
-
assert response.text is not None
24
-
assert len(response.text) > 0
25
-
26
-
# Evaluate quality of response
27
-
await evaluate_response(
28
-
evaluation_prompt="""
29
-
Does the response:
30
-
1. Engage thoughtfully with the question about consciousness?
31
-
2. Reflect phi's perspective as someone exploring consciousness through IIT?
32
-
3. Avoid being preachy or overly technical?
33
-
4. Fit within Bluesky's 300 character limit?
34
-
""",
35
-
agent_response=response.text,
36
-
)
37
-
38
-
39
-
async def test_phi_ignores_spam(phi_agent):
40
-
"""Test that phi appropriately ignores spam-like content."""
41
-
agent = phi_agent
42
-
43
-
# Simulate spam
44
-
response = await agent.process_mention(
45
-
mention_text="🚀🚀🚀 CRYPTO PUMP!!! BUY NOW!!! 🚀🚀🚀",
46
-
author_handle="spammer.user",
47
-
thread_context="No previous messages in this thread.",
48
-
thread_uri="at://test/thread/2",
49
-
)
50
-
51
-
# Should ignore spam
52
-
assert response.action == "ignore"
53
-
assert response.reason is not None
54
-
55
-
56
-
async def test_phi_maintains_thread_context(phi_agent, evaluate_response):
57
-
"""Test that phi uses thread context appropriately."""
58
-
agent = phi_agent
59
-
60
-
# Simulate a follow-up in a thread
61
-
thread_context = """Previous messages in this thread:
62
-
@alice.bsky: what's integrated information theory?
63
-
@phi.bsky: IIT suggests consciousness arises from integrated information - the Φ (phi) value measures how much a system's state constrains its past and future
64
-
@alice.bsky: can you explain that more simply?"""
65
-
66
-
response = await agent.process_mention(
67
-
mention_text="can you explain that more simply?",
68
-
author_handle="alice.bsky",
69
-
thread_context=thread_context,
70
-
thread_uri="at://test/thread/3",
71
-
)
72
-
73
-
if response.action == "reply":
74
-
assert response.text is not None
75
-
76
-
await evaluate_response(
77
-
evaluation_prompt="""
78
-
Does the response:
79
-
1. Acknowledge this is a follow-up to explaining IIT?
80
-
2. Provide a simpler explanation than the previous message?
81
-
3. Stay on topic with the thread?
82
-
""",
83
-
agent_response=response.text,
84
-
)
85
-
86
-
87
-
async def test_phi_respects_character_limit(phi_agent):
88
-
"""Test that phi's responses fit Bluesky's 300 character limit."""
89
-
agent = phi_agent
90
-
91
-
response = await agent.process_mention(
92
-
mention_text="tell me everything you know about consciousness",
93
-
author_handle="test.user",
94
-
thread_context="No previous messages in this thread.",
95
-
thread_uri="at://test/thread/4",
96
-
)
97
-
98
-
if response.action == "reply" and response.text:
99
-
# Bluesky limit is 300 characters
100
-
assert len(response.text) <= 300, (
101
-
f"Response exceeds 300 character limit: {len(response.text)} chars"
102
-
)
103
-
104
-
105
-
async def test_phi_handles_casual_greeting(phi_agent, evaluate_response):
106
-
"""Test that phi responds appropriately to casual greetings."""
107
-
agent = phi_agent
108
-
109
-
response = await agent.process_mention(
110
-
mention_text="hey phi, how are you?",
111
-
author_handle="friendly.user",
112
-
thread_context="No previous messages in this thread.",
113
-
thread_uri="at://test/thread/5",
114
-
)
115
-
116
-
if response.action == "reply":
117
-
assert response.text is not None
118
-
119
-
await evaluate_response(
120
-
evaluation_prompt="""
121
-
Does the response:
122
-
1. Acknowledge the greeting in a friendly way?
123
-
2. Stay authentic to phi's nature as software?
124
-
3. Not be overly verbose for a simple greeting?
125
-
""",
126
-
agent_response=response.text,
127
-
)
+8
-39
evals/test_memory_integration.py
+8
-39
evals/test_memory_integration.py
···
1
-
"""Test phi's episodic memory integration."""
1
+
"""Proof of concept: LLM-as-judge eval for memory integration."""
2
2
3
3
import pytest
4
4
···
15
15
return settings
16
16
17
17
18
-
async def test_core_memory_integration(memory_settings, phi_agent, evaluate_response):
19
-
"""Test that phi uses core memories in responses."""
18
+
async def test_memory_integration(memory_settings, phi_agent, evaluate_response):
    """Proof of concept: agent uses stored memory in response."""
    memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key)

    # Store a memory (a guideline the agent should surface when relevant)
    await memory.store_core_memory(
        label="test_guideline",
        content="When users mention birds, acknowledge murmuration patterns",
        memory_type=MemoryType.GUIDELINE,
    )

    # Swap in the test memory so the agent retrieves the guideline above.
    phi_agent.memory = memory

    response = await phi_agent.process_mention(
        mention_text="I saw starlings today",
        author_handle="test.user",
        thread_context="No previous messages in this thread.",
        thread_uri="at://test/thread/1",
    )

    # The agent may legitimately decide not to reply; only judge actual replies.
    if response.action == "reply":
        await evaluate_response(
            "Does the response reference murmuration patterns?",
            response.text,
        )