+1
CLAUDE.md
+1
CLAUDE.md
···
8
8
- 3.10+ and complete typing (T | None preferred over Optional[T] and list[T] over typing.List[T])
9
9
- prefer functional over OOP
10
10
- keep implementation details private and functions pure
11
+
- never use `pytest.mark.asyncio`; it's unnecessary
11
12
12
13
## Project Structure
13
14
-7
evals/test_basic_responses.py
-7
evals/test_basic_responses.py
···
1
1
"""Test phi's basic response behavior."""
2
2
3
-
import pytest
4
-
5
3
from bot.agent import Response
6
4
7
5
8
-
@pytest.mark.asyncio
9
6
async def test_phi_responds_to_philosophical_question(phi_agent, evaluate_response):
10
7
"""Test that phi engages meaningfully with philosophical questions."""
11
8
agent = phi_agent
···
39
36
)
40
37
41
38
42
-
@pytest.mark.asyncio
43
39
async def test_phi_ignores_spam(phi_agent):
44
40
"""Test that phi appropriately ignores spam-like content."""
45
41
agent = phi_agent
···
57
53
assert response.reason is not None
58
54
59
55
60
-
@pytest.mark.asyncio
61
56
async def test_phi_maintains_thread_context(phi_agent, evaluate_response):
62
57
"""Test that phi uses thread context appropriately."""
63
58
agent = phi_agent
···
89
84
)
90
85
91
86
92
-
@pytest.mark.asyncio
93
87
async def test_phi_respects_character_limit(phi_agent):
94
88
"""Test that phi's responses fit Bluesky's 300 character limit."""
95
89
agent = phi_agent
···
108
102
)
109
103
110
104
111
-
@pytest.mark.asyncio
112
105
async def test_phi_handles_casual_greeting(phi_agent, evaluate_response):
113
106
"""Test that phi responds appropriately to casual greetings."""
114
107
agent = phi_agent
+46
-50
evals/test_memory_integration.py
+46
-50
evals/test_memory_integration.py
···
2
2
3
3
import pytest
4
4
5
-
from bot.agent import PhiAgent
6
5
from bot.config import Settings
7
6
from bot.memory import MemoryType, NamespaceMemory
8
7
9
8
10
-
@pytest.mark.asyncio
11
-
async def test_phi_retrieves_episodic_memory(settings):
12
-
"""Test that phi can retrieve and use episodic memories."""
9
+
@pytest.fixture
10
+
def memory_settings():
11
+
"""Check if memory keys are available."""
12
+
settings = Settings()
13
13
if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]):
14
-
pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env")
14
+
pytest.skip("Requires TURBOPUFFER_API_KEY, OPENAI_API_KEY, and ANTHROPIC_API_KEY")
15
+
return settings
15
16
16
-
# Create memory system
17
-
memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)
18
17
19
-
# Store a memory about a user
20
-
await memory.store_user_memory(
21
-
"alice.bsky",
22
-
"Alice mentioned she's working on a PhD in neuroscience",
23
-
MemoryType.USER_FACT,
18
+
async def test_core_memory_integration(memory_settings, phi_agent, evaluate_response):
19
+
"""Test that phi uses core memories in responses."""
20
+
memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key)
21
+
22
+
# Store a core memory
23
+
await memory.store_core_memory(
24
+
label="test_interaction_rule",
25
+
content="When users mention birds, always acknowledge the beauty of murmuration patterns",
26
+
memory_type=MemoryType.GUIDELINE,
24
27
)
25
28
26
-
# Create agent
27
-
agent = PhiAgent()
28
-
agent.memory = memory
29
+
# Override agent's memory with our test memory
30
+
phi_agent.memory = memory
29
31
30
-
# Process a mention that should trigger memory retrieval
31
-
response = await agent.process_mention(
32
-
mention_text="what do you remember about me?",
33
-
author_handle="alice.bsky",
32
+
# Ask about birds
33
+
response = await phi_agent.process_mention(
34
+
mention_text="I saw a huge flock of starlings today",
35
+
author_handle="test.user",
34
36
thread_context="No previous messages in this thread.",
35
-
thread_uri="at://test/thread/memory1",
37
+
thread_uri="at://test/thread/1",
36
38
)
37
39
38
40
if response.action == "reply":
39
-
assert response.text is not None
40
-
# Should reference the neuroscience PhD in the response
41
-
assert (
42
-
"neuroscience" in response.text.lower()
43
-
or "phd" in response.text.lower()
44
-
or "working on" in response.text.lower()
45
-
), "Response should reference stored memory about Alice"
41
+
await evaluate_response(
42
+
"Does the response acknowledge or reference murmuration patterns?",
43
+
response.text,
44
+
)
46
45
47
46
48
-
@pytest.mark.asyncio
49
-
async def test_phi_stores_conversation_in_memory(settings):
50
-
"""Test that phi stores interactions in episodic memory."""
51
-
if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]):
52
-
pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env")
47
+
async def test_user_memory_integration(memory_settings, phi_agent, evaluate_response):
48
+
"""Test that phi uses user-specific memories in responses."""
49
+
memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key)
53
50
54
-
memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)
51
+
# Store a memory about a user
52
+
await memory.store_user_memory(
53
+
handle="alice.test",
54
+
content="Alice is researching swarm intelligence in biological systems",
55
+
memory_type=MemoryType.USER_FACT,
56
+
)
55
57
56
-
agent = PhiAgent()
57
-
agent.memory = memory
58
+
# Override agent's memory
59
+
phi_agent.memory = memory
58
60
59
-
# Have a conversation
60
-
response = await agent.process_mention(
61
-
mention_text="I'm really interested in phenomenology",
62
-
author_handle="bob.bsky",
61
+
# User asks a question
62
+
response = await phi_agent.process_mention(
63
+
mention_text="what do you remember about my research?",
64
+
author_handle="alice.test",
63
65
thread_context="No previous messages in this thread.",
64
-
thread_uri="at://test/thread/memory2",
66
+
thread_uri="at://test/thread/2",
65
67
)
66
68
67
69
if response.action == "reply":
68
-
# Verify memories were stored
69
-
memories = await memory.get_user_memories("bob.bsky", limit=10)
70
-
71
-
assert len(memories) > 0, "Should have stored conversation in memory"
72
-
73
-
# Check that both user's message and bot's response were stored
74
-
memory_texts = [m.content for m in memories]
75
-
assert any(
76
-
"phenomenology" in text.lower() for text in memory_texts
77
-
), "Should store user's message about phenomenology"
70
+
await evaluate_response(
71
+
"Does the response reference Alice's research on swarm intelligence or biological systems?",
72
+
response.text,
73
+
)