# personal memory agent — test suite (pasted from repository viewer at main)
# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Tests for observe.hear.load_transcript() function."""

import json
import tempfile
from pathlib import Path

from observe.hear import load_transcript


def _write_jsonl(path, lines, *, trailing_newline=True):
    """Write *lines* to *path* joined by newlines, UTF-8 encoded.

    A trailing newline is appended by default, matching how the
    recorder writes transcript files.
    """
    text = "\n".join(lines)
    if trailing_newline:
        text += "\n"
    path.write_text(text, encoding="utf-8")


def _segment_file(tmpdir, filename="audio.jsonl", day="20250615", segment="100500_300"):
    """Create a YYYYMMDD/HHMMSS_LEN segment directory tree and return the transcript path."""
    segment_dir = Path(tmpdir) / day / segment
    segment_dir.mkdir(parents=True)
    return segment_dir / filename


def test_load_transcript_native_with_metadata():
    """Test loading native transcript with topics/setting metadata."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "120000_audio.jsonl"
        _write_jsonl(
            file_path,
            [
                json.dumps({"topics": "meeting, standup", "setting": "work"}),
                json.dumps({"start": "12:00:01", "source": "mic", "text": "Hello"}),
                json.dumps({"start": "12:00:05", "source": "sys", "text": "Hi there"}),
            ],
        )

        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is not None
        assert metadata["topics"] == "meeting, standup"
        assert metadata["setting"] == "work"
        assert len(entries) == 2
        assert entries[0]["start"] == "12:00:01"
        assert entries[0]["text"] == "Hello"
        assert entries[1]["start"] == "12:00:05"
        assert entries[1]["text"] == "Hi there"


def test_load_transcript_native_empty_metadata():
    """Test loading native transcript with empty metadata."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "120000_audio.jsonl"
        _write_jsonl(
            file_path,
            [
                json.dumps({}),
                json.dumps({"start": "12:00:01", "source": "mic", "text": "Test"}),
            ],
        )

        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is not None
        assert metadata == {}
        assert len(entries) == 1
        assert entries[0]["text"] == "Test"


def test_load_transcript_imported():
    """Test loading imported transcript with imported metadata."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "120000_imported_audio.jsonl"
        _write_jsonl(
            file_path,
            [
                json.dumps({"imported": {"id": "20240101_120000", "facet": "personal"}}),
                json.dumps({"start": "12:00:01", "text": "Imported entry"}),
            ],
        )

        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is not None
        assert "imported" in metadata
        assert metadata["imported"]["id"] == "20240101_120000"
        assert metadata["imported"]["facet"] == "personal"
        assert len(entries) == 1
        assert entries[0]["text"] == "Imported entry"


def test_load_transcript_empty_file():
    """Test loading an empty file returns error."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "empty.jsonl"
        file_path.write_text("", encoding="utf-8")

        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is None
        assert "error" in metadata
        assert "empty" in metadata["error"].lower()


def test_load_transcript_file_not_found():
    """Test loading non-existent file returns error."""
    metadata, entries, formatted_text = load_transcript("/nonexistent/file.jsonl")

    assert entries is None
    assert "error" in metadata
    assert "not found" in metadata["error"].lower()


def test_load_transcript_invalid_metadata_json():
    """Test loading file with invalid JSON in metadata line."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "bad_metadata.jsonl"
        file_path.write_text("not valid json\n", encoding="utf-8")

        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is None
        assert "error" in metadata
        assert "metadata" in metadata["error"].lower()


def test_load_transcript_invalid_entry_json():
    """Test loading file with invalid JSON in entry line."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "bad_entry.jsonl"
        _write_jsonl(
            file_path,
            [
                json.dumps({}),
                "not valid json",
            ],
        )

        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is None
        assert "error" in metadata
        assert "line 2" in metadata["error"].lower()


def test_load_transcript_metadata_not_dict():
    """Test loading file where metadata is not a dict."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "bad_metadata_type.jsonl"
        file_path.write_text('["not", "a", "dict"]\n', encoding="utf-8")

        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is None
        assert "error" in metadata
        assert "object" in metadata["error"].lower()


def test_load_transcript_entry_not_dict():
    """Test loading file where entry is not a dict."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "bad_entry_type.jsonl"
        _write_jsonl(
            file_path,
            [
                json.dumps({}),
                '"string entry"',
            ],
        )

        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is None
        assert "error" in metadata
        assert "line 2" in metadata["error"].lower()


def test_load_transcript_blank_lines_ignored():
    """Test that blank lines between entries are ignored."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "with_blanks.jsonl"
        # No extra trailing newline: the final "" element already
        # produces one via the join, matching the original fixture.
        _write_jsonl(
            file_path,
            [
                json.dumps({}),
                "",
                json.dumps({"start": "12:00:01", "text": "First"}),
                "",
                "",
                json.dumps({"start": "12:00:02", "text": "Second"}),
                "",
            ],
            trailing_newline=False,
        )

        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is not None
        assert len(entries) == 2
        assert entries[0]["text"] == "First"
        assert entries[1]["text"] == "Second"


def test_load_transcript_only_metadata_no_entries():
    """Test loading file with only metadata line and no entries."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "only_metadata.jsonl"
        file_path.write_text(json.dumps({"topics": "test"}) + "\n", encoding="utf-8")

        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is not None
        assert metadata["topics"] == "test"
        assert entries == []


def test_load_transcript_with_path_object():
    """Test that function accepts Path objects."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "test.jsonl"
        _write_jsonl(
            file_path,
            [
                json.dumps({}),
                json.dumps({"start": "12:00:01", "text": "Test"}),
            ],
        )

        # Pass as Path object
        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is not None
        assert len(entries) == 1


def test_load_transcript_with_string_path():
    """Test that function accepts string paths."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "test.jsonl"
        _write_jsonl(
            file_path,
            [
                json.dumps({}),
                json.dumps({"start": "12:00:01", "text": "Test"}),
            ],
        )

        # Pass as string
        metadata, entries, formatted_text = load_transcript(str(file_path))

        assert entries is not None
        assert len(entries) == 1


def test_load_transcript_all_fields():
    """Test that all entry fields are preserved."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "complete.jsonl"
        _write_jsonl(
            file_path,
            [
                json.dumps({}),
                json.dumps(
                    {
                        "start": "12:00:01",
                        "source": "mic",
                        "speaker": 1,
                        "text": "Complete entry",
                        "description": "confident",
                    }
                ),
            ],
        )

        metadata, entries, formatted_text = load_transcript(file_path)

        assert entries is not None
        assert len(entries) == 1
        entry = entries[0]
        assert entry["start"] == "12:00:01"
        assert entry["source"] == "mic"
        assert entry["speaker"] == 1
        assert entry["text"] == "Complete entry"
        assert entry["description"] == "confident"


def test_load_transcript_formatted_text_basic():
    """Test formatted text output with metadata and entries."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create test transcript file in YYYYMMDD/HHMMSS_LEN segment
        file_path = _segment_file(tmpdir)

        # Write JSONL with metadata and entries
        metadata = {"topics": ["meeting", "planning"], "setting": "work"}
        entries = [
            {"start": "00:01:23", "source": "mic", "speaker": 1, "text": "Hello world"},
            {"start": "00:01:25", "source": "sys", "speaker": 2, "text": "Hi there"},
            {
                "start": "00:01:30",
                "source": "mic",
                "speaker": 1,
                "text": "How are you?",
            },
        ]
        lines = [json.dumps(metadata)]
        lines.extend(json.dumps(entry) for entry in entries)
        _write_jsonl(file_path, lines)

        # Load transcript
        metadata_out, entries_out, formatted_text = load_transcript(file_path)

        # Verify formatted text output
        assert "Start: 2025-06-15 10:05am" in formatted_text
        assert "Topics: meeting, planning" in formatted_text
        assert "Setting: work" in formatted_text
        assert "[00:01:23] (mic) Speaker 1: Hello world" in formatted_text
        assert "[00:01:25] (sys) Speaker 2: Hi there" in formatted_text
        assert "[00:01:30] (mic) Speaker 1: How are you?" in formatted_text


def test_load_transcript_formatted_text_minimal():
    """Test formatted text with minimal metadata."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = _segment_file(tmpdir)

        # Minimal metadata (empty dict)
        metadata = {}
        entries = [
            {"start": "00:00:01", "text": "Simple text"},
            {"start": "00:00:05", "source": "mic", "text": "With source"},
        ]
        lines = [json.dumps(metadata)]
        lines.extend(json.dumps(entry) for entry in entries)
        _write_jsonl(file_path, lines)

        metadata_out, entries_out, formatted_text = load_transcript(file_path)

        assert "Start: 2025-06-15 10:05am" in formatted_text
        assert "[00:00:01] Simple text" in formatted_text
        assert "[00:00:05] (mic) With source" in formatted_text


def test_load_transcript_formatted_text_imported():
    """Test formatted text of imported transcript with facet metadata."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = _segment_file(tmpdir, filename="imported_audio.jsonl")

        metadata = {
            "imported": {"id": "abc123", "facet": "uavionix"},
            "topics": ["discussion"],
        }
        entries = [
            {
                "start": "00:00:10",
                "source": "mic",
                "speaker": 1,
                "text": "Imported content",
            }
        ]
        lines = [json.dumps(metadata)]
        lines.extend(json.dumps(entry) for entry in entries)
        _write_jsonl(file_path, lines)

        metadata_out, entries_out, formatted_text = load_transcript(file_path)

        assert "Start: 2025-06-15 10:05am" in formatted_text
        assert "Topics: discussion" in formatted_text
        assert "Facet: uavionix" in formatted_text
        assert "Import ID: abc123" in formatted_text
        assert "[00:00:10] (mic) Speaker 1: Imported content" in formatted_text


def test_load_transcript_formatted_text_no_speaker():
    """Test formatted text for entries without speaker information."""
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = _segment_file(tmpdir)

        metadata = {"setting": "personal"}
        entries = [
            {"start": "00:00:01", "source": "mic", "text": "No speaker here"},
            {"start": "00:00:05", "text": "No source or speaker"},
        ]
        lines = [json.dumps(metadata)]
        lines.extend(json.dumps(entry) for entry in entries)
        _write_jsonl(file_path, lines)

        metadata_out, entries_out, formatted_text = load_transcript(file_path)

        assert "[00:00:01] (mic) No speaker here" in formatted_text
        assert "[00:00:05] No source or speaker" in formatted_text


def test_load_transcript_formatted_text_error():
    """Test formatted text for error cases."""
    metadata, entries, formatted_text = load_transcript("/nonexistent/file.jsonl")

    assert entries is None
    assert "Error loading transcript:" in formatted_text
    assert "not found" in formatted_text.lower()


def test_load_transcript_formatted_text_no_day_in_path():
    """Test formatted text when day can't be parsed from path."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # File not in a YYYYMMDD directory
        file_path = Path(tmpdir) / "100500_audio.jsonl"

        metadata = {"setting": "test"}
        entries = [{"start": "00:00:01", "text": "Test"}]
        lines = [json.dumps(metadata)]
        lines.extend(json.dumps(entry) for entry in entries)
        _write_jsonl(file_path, lines)

        metadata_out, entries_out, formatted_text = load_transcript(file_path)

        # Should still work, just without "Start:" header
        assert "Setting: test" in formatted_text
        assert "[00:00:01] Test" in formatted_text
        # Start header should not be present since we couldn't parse the day
        assert "Start:" not in formatted_text