# personal memory agent
1# SPDX-License-Identifier: AGPL-3.0-only
2# Copyright (c) 2026 sol pbc
3
4"""Tests for observe.hear.load_transcript() function."""
5
6import json
7import tempfile
8from pathlib import Path
9
10from observe.hear import load_transcript
11
12
def test_load_transcript_native_with_metadata():
    """Native transcripts expose topics/setting metadata and parsed entries."""
    with tempfile.TemporaryDirectory() as tmp:
        transcript = Path(tmp) / "120000_audio.jsonl"

        header = {"topics": "meeting, standup", "setting": "work"}
        rows = [
            {"start": "12:00:01", "source": "mic", "text": "Hello"},
            {"start": "12:00:05", "source": "sys", "text": "Hi there"},
        ]
        content = "".join(json.dumps(obj) + "\n" for obj in [header, *rows])
        transcript.write_text(content, encoding="utf-8")

        metadata, entries, _ = load_transcript(transcript)

        assert entries is not None
        assert metadata["topics"] == "meeting, standup"
        assert metadata["setting"] == "work"
        assert len(entries) == 2
        assert entries[0]["start"] == "12:00:01"
        assert entries[0]["text"] == "Hello"
        assert entries[1]["start"] == "12:00:05"
        assert entries[1]["text"] == "Hi there"
35
36
def test_load_transcript_native_empty_metadata():
    """An empty metadata header line yields an empty metadata dict."""
    with tempfile.TemporaryDirectory() as tmp:
        transcript = Path(tmp) / "120000_audio.jsonl"
        body = (
            json.dumps({})
            + "\n"
            + json.dumps({"start": "12:00:01", "source": "mic", "text": "Test"})
            + "\n"
        )
        transcript.write_text(body, encoding="utf-8")

        metadata, entries, _ = load_transcript(transcript)

        assert entries is not None
        assert metadata == {}
        assert len(entries) == 1
        assert entries[0]["text"] == "Test"
54
55
def test_load_transcript_imported():
    """Imported transcripts carry their 'imported' metadata block through."""
    with tempfile.TemporaryDirectory() as tmp:
        transcript = Path(tmp) / "120000_imported_audio.jsonl"
        records = [
            {"imported": {"id": "20240101_120000", "facet": "personal"}},
            {"start": "12:00:01", "text": "Imported entry"},
        ]
        transcript.write_text(
            "".join(json.dumps(record) + "\n" for record in records),
            encoding="utf-8",
        )

        metadata, entries, _ = load_transcript(transcript)

        assert entries is not None
        assert "imported" in metadata
        assert metadata["imported"]["id"] == "20240101_120000"
        assert metadata["imported"]["facet"] == "personal"
        assert len(entries) == 1
        assert entries[0]["text"] == "Imported entry"
75
76
def test_load_transcript_empty_file():
    """A zero-byte file is reported as an error mentioning 'empty'."""
    with tempfile.TemporaryDirectory() as tmp:
        empty = Path(tmp) / "empty.jsonl"
        empty.write_text("", encoding="utf-8")

        metadata, entries, _ = load_transcript(empty)

        assert entries is None
        assert "error" in metadata
        assert "empty" in metadata["error"].lower()
88
89
def test_load_transcript_file_not_found():
    """A missing path yields a 'not found' error rather than raising."""
    metadata, entries, _ = load_transcript("/nonexistent/file.jsonl")

    assert entries is None
    assert "error" in metadata
    assert "not found" in metadata["error"].lower()
97
98
def test_load_transcript_invalid_metadata_json():
    """Unparseable JSON on the metadata line reports a metadata error."""
    with tempfile.TemporaryDirectory() as tmp:
        bad = Path(tmp) / "bad_metadata.jsonl"
        bad.write_text("not valid json\n", encoding="utf-8")

        metadata, entries, _ = load_transcript(bad)

        assert entries is None
        assert "error" in metadata
        assert "metadata" in metadata["error"].lower()
110
111
def test_load_transcript_invalid_entry_json():
    """Unparseable JSON in an entry line reports the offending line number."""
    with tempfile.TemporaryDirectory() as tmp:
        bad = Path(tmp) / "bad_entry.jsonl"
        bad.write_text(
            json.dumps({}) + "\n" + "not valid json" + "\n", encoding="utf-8"
        )

        metadata, entries, _ = load_transcript(bad)

        assert entries is None
        assert "error" in metadata
        assert "line 2" in metadata["error"].lower()
128
129
def test_load_transcript_metadata_not_dict():
    """A metadata line that is valid JSON but not an object is rejected."""
    with tempfile.TemporaryDirectory() as tmp:
        bad = Path(tmp) / "bad_metadata_type.jsonl"
        bad.write_text('["not", "a", "dict"]\n', encoding="utf-8")

        metadata, entries, _ = load_transcript(bad)

        assert entries is None
        assert "error" in metadata
        assert "object" in metadata["error"].lower()
141
142
def test_load_transcript_entry_not_dict():
    """An entry line that is valid JSON but not an object reports its line."""
    with tempfile.TemporaryDirectory() as tmp:
        bad = Path(tmp) / "bad_entry_type.jsonl"
        bad.write_text(
            json.dumps({}) + "\n" + '"string entry"' + "\n", encoding="utf-8"
        )

        metadata, entries, _ = load_transcript(bad)

        assert entries is None
        assert "error" in metadata
        assert "line 2" in metadata["error"].lower()
159
160
def test_load_transcript_blank_lines_ignored():
    """Blank lines scattered through the file are skipped, not parsed."""
    with tempfile.TemporaryDirectory() as tmp:
        transcript = Path(tmp) / "with_blanks.jsonl"
        # Same bytes as joining [meta, "", e1, "", "", e2, ""] with newlines.
        content = (
            json.dumps({})
            + "\n\n"
            + json.dumps({"start": "12:00:01", "text": "First"})
            + "\n\n\n"
            + json.dumps({"start": "12:00:02", "text": "Second"})
            + "\n"
        )
        transcript.write_text(content, encoding="utf-8")

        metadata, entries, _ = load_transcript(transcript)

        assert entries is not None
        assert len(entries) == 2
        assert entries[0]["text"] == "First"
        assert entries[1]["text"] == "Second"
183
184
def test_load_transcript_only_metadata_no_entries():
    """A metadata-only file loads successfully with an empty entry list."""
    with tempfile.TemporaryDirectory() as tmp:
        transcript = Path(tmp) / "only_metadata.jsonl"
        transcript.write_text(
            json.dumps({"topics": "test"}) + "\n", encoding="utf-8"
        )

        metadata, entries, _ = load_transcript(transcript)

        assert entries is not None
        assert metadata["topics"] == "test"
        assert entries == []
196
197
def test_load_transcript_with_path_object():
    """load_transcript accepts a pathlib.Path argument."""
    with tempfile.TemporaryDirectory() as tmp:
        transcript = Path(tmp) / "test.jsonl"
        body = (
            json.dumps({})
            + "\n"
            + json.dumps({"start": "12:00:01", "text": "Test"})
            + "\n"
        )
        transcript.write_text(body, encoding="utf-8")

        # Deliberately passed as a Path object, not a string.
        metadata, entries, _ = load_transcript(transcript)

        assert entries is not None
        assert len(entries) == 1
214
215
def test_load_transcript_with_string_path():
    """load_transcript accepts a plain string path."""
    with tempfile.TemporaryDirectory() as tmp:
        transcript = Path(tmp) / "test.jsonl"
        body = (
            json.dumps({})
            + "\n"
            + json.dumps({"start": "12:00:01", "text": "Test"})
            + "\n"
        )
        transcript.write_text(body, encoding="utf-8")

        # Deliberately passed as a string, not a Path.
        metadata, entries, _ = load_transcript(str(transcript))

        assert entries is not None
        assert len(entries) == 1
232
233
def test_load_transcript_all_fields():
    """Every field on an entry survives the round trip unchanged."""
    with tempfile.TemporaryDirectory() as tmp:
        transcript = Path(tmp) / "complete.jsonl"
        full_entry = {
            "start": "12:00:01",
            "source": "mic",
            "speaker": 1,
            "text": "Complete entry",
            "description": "confident",
        }
        transcript.write_text(
            json.dumps({}) + "\n" + json.dumps(full_entry) + "\n",
            encoding="utf-8",
        )

        metadata, entries, _ = load_transcript(transcript)

        assert entries is not None
        assert len(entries) == 1
        loaded = entries[0]
        assert loaded["start"] == "12:00:01"
        assert loaded["source"] == "mic"
        assert loaded["speaker"] == 1
        assert loaded["text"] == "Complete entry"
        assert loaded["description"] == "confident"
263
264
def test_load_transcript_formatted_text_basic():
    """Formatted output carries the start header, metadata, and entry lines."""
    with tempfile.TemporaryDirectory() as tmp:
        # Segment layout: <root>/YYYYMMDD/HHMMSS_LEN/audio.jsonl
        segment = Path(tmp) / "20250615" / "100500_300"
        segment.mkdir(parents=True)
        transcript = segment / "audio.jsonl"

        header = {"topics": ["meeting", "planning"], "setting": "work"}
        rows = [
            {"start": "00:01:23", "source": "mic", "speaker": 1, "text": "Hello world"},
            {"start": "00:01:25", "source": "sys", "speaker": 2, "text": "Hi there"},
            {"start": "00:01:30", "source": "mic", "speaker": 1, "text": "How are you?"},
        ]
        transcript.write_text(
            "".join(json.dumps(obj) + "\n" for obj in [header, *rows])
        )

        _, _, formatted_text = load_transcript(transcript)

        assert "Start: 2025-06-15 10:05am" in formatted_text
        assert "Topics: meeting, planning" in formatted_text
        assert "Setting: work" in formatted_text
        assert "[00:01:23] (mic) Speaker 1: Hello world" in formatted_text
        assert "[00:01:25] (sys) Speaker 2: Hi there" in formatted_text
        assert "[00:01:30] (mic) Speaker 1: How are you?" in formatted_text
302
303
def test_load_transcript_formatted_text_minimal():
    """With empty metadata, formatting still emits the start header and entries."""
    with tempfile.TemporaryDirectory() as tmp:
        segment = Path(tmp) / "20250615" / "100500_300"
        segment.mkdir(parents=True)
        transcript = segment / "audio.jsonl"

        rows = [
            {"start": "00:00:01", "text": "Simple text"},
            {"start": "00:00:05", "source": "mic", "text": "With source"},
        ]
        transcript.write_text(
            "".join(json.dumps(obj) + "\n" for obj in [{}, *rows])
        )

        _, _, formatted_text = load_transcript(transcript)

        assert "Start: 2025-06-15 10:05am" in formatted_text
        assert "[00:00:01] Simple text" in formatted_text
        assert "[00:00:05] (mic) With source" in formatted_text
329
330
def test_load_transcript_formatted_text_imported():
    """Imported transcripts surface facet and import id in the formatted text."""
    with tempfile.TemporaryDirectory() as tmp:
        segment = Path(tmp) / "20250615" / "100500_300"
        segment.mkdir(parents=True)
        transcript = segment / "imported_audio.jsonl"

        header = {
            "imported": {"id": "abc123", "facet": "uavionix"},
            "topics": ["discussion"],
        }
        row = {
            "start": "00:00:10",
            "source": "mic",
            "speaker": 1,
            "text": "Imported content",
        }
        transcript.write_text(json.dumps(header) + "\n" + json.dumps(row) + "\n")

        _, _, formatted_text = load_transcript(transcript)

        assert "Start: 2025-06-15 10:05am" in formatted_text
        assert "Topics: discussion" in formatted_text
        assert "Facet: uavionix" in formatted_text
        assert "Import ID: abc123" in formatted_text
        assert "[00:00:10] (mic) Speaker 1: Imported content" in formatted_text
364
365
def test_load_transcript_formatted_text_no_speaker():
    """Entries lacking speaker (or source) format without those prefixes."""
    with tempfile.TemporaryDirectory() as tmp:
        segment = Path(tmp) / "20250615" / "100500_300"
        segment.mkdir(parents=True)
        transcript = segment / "audio.jsonl"

        rows = [
            {"start": "00:00:01", "source": "mic", "text": "No speaker here"},
            {"start": "00:00:05", "text": "No source or speaker"},
        ]
        transcript.write_text(
            "".join(
                json.dumps(obj) + "\n" for obj in [{"setting": "personal"}, *rows]
            )
        )

        _, _, formatted_text = load_transcript(transcript)

        assert "[00:00:01] (mic) No speaker here" in formatted_text
        assert "[00:00:05] No source or speaker" in formatted_text
389
390
def test_load_transcript_formatted_text_error():
    """Error cases still produce human-readable formatted text."""
    metadata, entries, formatted_text = load_transcript("/nonexistent/file.jsonl")

    assert entries is None
    assert "Error loading transcript:" in formatted_text
    assert "not found" in formatted_text.lower()
398
399
def test_load_transcript_formatted_text_no_day_in_path():
    """Without a YYYYMMDD directory in the path, the Start header is omitted."""
    with tempfile.TemporaryDirectory() as tmp:
        # Deliberately NOT nested inside a YYYYMMDD directory.
        transcript = Path(tmp) / "100500_audio.jsonl"
        transcript.write_text(
            json.dumps({"setting": "test"})
            + "\n"
            + json.dumps({"start": "00:00:01", "text": "Test"})
            + "\n"
        )

        _, _, formatted_text = load_transcript(transcript)

        # Loading still succeeds; only the day-derived header is missing.
        assert "Setting: test" in formatted_text
        assert "[00:00:01] Test" in formatted_text
        assert "Start:" not in formatted_text