feat: consolidate the transcripts app's two per-day traversals into a single scan_day() pass

+12 -2

apps/transcripts/routes.py

··· 29 29 from observe.hear import format_audio 30 30 from observe.screen import format_screen 31 31 from observe.utils import AUDIO_EXTENSIONS, VIDEO_EXTENSIONS 32 - from think.cluster import cluster_scan, cluster_segments 32 + from think.cluster import cluster_scan, cluster_segments, scan_day 33 33 from think.entities.journal import get_journal_principal, load_journal_entity 34 34 from think.models import get_usage_cost 35 35 from think.utils import day_dirs, day_path, segment_path ··· 91 91 return jsonify({"segments": segments}) 92 92 93 93 94 + @transcripts_bp.route("/api/day/<day>") 95 + def transcript_day_data(day: str) -> Any: 96 + """Return combined ranges and segments for a day in a single response.""" 97 + if not DATE_RE.fullmatch(day): 98 + return "", 404 99 + 100 + audio_ranges, screen_ranges, segments = scan_day(day) 101 + return jsonify({"audio": audio_ranges, "screen": screen_ranges, "segments": segments}) 102 + 103 + 94 104 @transcripts_bp.route("/api/serve_file/<day>/<path:encoded_path>") 95 105 def serve_file(day: str, encoded_path: str) -> Any: 96 106 """Serve actual media files for embedding.""" ··· 101 111 rel_path = encoded_path.replace("__", "/") 102 112 full_path = os.path.join(state.journal_root, day, rel_path) 103 113 104 - day_dir = str(day_path(day)) 114 + day_dir = str(day_path(day, create=False)) 105 115 if not os.path.commonpath([full_path, day_dir]) == day_dir: 106 116 return "", 403 107 117

+11 -16

apps/transcripts/workspace.html

··· 2266 2266 updateZoom(); 2267 2267 }).observe(zoom); 2268 2268 2269 - // Load transcript ranges and segments in parallel, initialize after both resolve 2270 - const rangesFetch = fetch(`/app/transcripts/api/ranges/${day}`).then(r => { 2271 - if (!r.ok) throw new Error(`Ranges failed: ${r.status}`); 2272 - return r.json(); 2273 - }); 2274 - const segmentsFetch = fetch(`/app/transcripts/api/segments/${day}`).then(r => { 2275 - if (!r.ok) throw new Error(`Segments failed: ${r.status}`); 2276 - return r.json(); 2277 - }); 2278 - 2279 - Promise.all([rangesFetch, segmentsFetch]) 2280 - .then(([rangesData, segmentsData]) => { 2269 + // Load combined transcript data 2270 + fetch(`/app/transcripts/api/day/${day}`) 2271 + .then(r => { 2272 + if (!r.ok) throw new Error(`Day data failed: ${r.status}`); 2273 + return r.json(); 2274 + }) 2275 + .then(data => { 2281 2276 // Apply dynamic timeline bounds from ranges 2282 - const bounds = computeTimelineBounds(rangesData); 2277 + const bounds = computeTimelineBounds(data); 2283 2278 timelineStart = bounds.start; 2284 2279 timelineEnd = bounds.end; 2285 2280 ··· 2296 2291 renderTimeline(); 2297 2292 2298 2293 // Add segment indicators from ranges 2299 - (rangesData.audio || []).forEach(rg => { 2294 + (data.audio || []).forEach(rg => { 2300 2295 const [s, e] = rg.map(parseTime); 2301 2296 addSegmentIndicator('audio', s, e, 0); 2302 2297 }); 2303 - (rangesData.screen || []).forEach(rg => { 2298 + (data.screen || []).forEach(rg => { 2304 2299 const [s, e] = rg.map(parseTime); 2305 2300 addSegmentIndicator('screen', s, e, 1); 2306 2301 }); 2307 2302 2308 2303 // Store segments and update zoom 2309 - allSegments = segmentsData.segments || []; 2304 + allSegments = data.segments || []; 2310 2305 updateZoom(); 2311 2306 2312 2307 // Check for hash fragment to auto-select segment

+59

tests/test_cluster.py

··· 576 576 assert _agent_matches_filter("meetings", filter_dict) is False 577 577 assert _agent_matches_filter("_todos_review", filter_dict) is True 578 578 assert _agent_matches_filter("flow", filter_dict) is False # Not in filter 579 + 580 + 581 + def test_scan_day_combined(tmp_path, monkeypatch): 582 + monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path)) 583 + day_dir = day_path("20240101") 584 + 585 + mod = importlib.import_module("think.cluster") 586 + 587 + first = day_dir / "default" / "090000_300" 588 + first.mkdir(parents=True) 589 + (first / "audio.jsonl").write_text("{}\n") 590 + (first / "screen.jsonl").write_text('{"raw": "screen.webm"}\n') 591 + 592 + second = day_dir / "default" / "093000_300" 593 + second.mkdir(parents=True) 594 + (second / "audio.jsonl").write_text("{}\n") 595 + 596 + audio_ranges, screen_ranges, segments = mod.scan_day("20240101") 597 + expected_ranges = mod.cluster_scan("20240101") 598 + expected_segments = mod.cluster_segments("20240101") 599 + 600 + assert audio_ranges == [("09:00", "09:15"), ("09:30", "09:45")] 601 + assert screen_ranges == [("09:00", "09:15")] 602 + assert segments == [ 603 + { 604 + "key": "090000_300", 605 + "start": "09:00", 606 + "end": "09:05", 607 + "types": ["audio", "screen"], 608 + "stream": "default", 609 + }, 610 + { 611 + "key": "093000_300", 612 + "start": "09:30", 613 + "end": "09:35", 614 + "types": ["audio"], 615 + "stream": "default", 616 + }, 617 + ] 618 + assert (audio_ranges, screen_ranges) == expected_ranges 619 + assert segments == expected_segments 620 + 621 + 622 + def test_scan_day_empty(tmp_path, monkeypatch): 623 + monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path)) 624 + 625 + mod = importlib.import_module("think.cluster") 626 + 627 + assert mod.scan_day("20250101") == ([], [], []) 628 + 629 + 630 + def test_day_path_create_false(tmp_path, monkeypatch): 631 + monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path)) 632 + 633 + missing = 
day_path("29990101", create=False) 634 + assert not missing.exists() 635 + 636 + created = day_path("29990101") 637 + assert created.exists()

+71 -83

think/cluster.py

··· 394 394 return ranges 395 395 396 396 397 - def cluster_scan(day: str) -> tuple[list[tuple[str, str]], list[tuple[str, str]]]: 398 - """Return 15-minute ranges with transcript and screen content for ``day``. 397 + def _detect_content_types(seg_path: Path) -> list[str]: 398 + """Detect content types present in a segment directory.""" 399 + types = [] 400 + if ( 401 + (seg_path / "audio.jsonl").exists() 402 + or any(seg_path.glob("*_audio.jsonl")) 403 + or any(seg_path.glob("*_transcript.jsonl")) 404 + or any(seg_path.glob("*_transcript.md")) 405 + or (seg_path / "imported.md").exists() 406 + ): 407 + types.append("audio") 408 + if (seg_path / "screen.jsonl").exists() or any(seg_path.glob("*_screen.jsonl")): 409 + types.append("screen") 410 + return types 411 + 412 + 413 + def scan_day( 414 + day: str, 415 + ) -> tuple[list[tuple[str, str]], list[tuple[str, str]], list[dict[str, Any]]]: 416 + """Single-pass scan returning both range aggregation and segment list. 417 + 418 + Combines the work of ``cluster_scan()`` and ``cluster_segments()`` 419 + into one ``iter_segments()`` traversal. 399 420 400 421 Args: 401 422 day: Day folder in ``YYYYMMDD`` format. 402 423 403 424 Returns: 404 - Two lists containing ``(start, end)`` pairs (``HH:MM``) for transcript and 405 - screen content respectively. 425 + Tuple of (audio_ranges, screen_ranges, segments) where ranges are 426 + ``(start, end)`` pairs in ``HH:MM`` format and segments is a list 427 + of dicts with ``key``, ``start``, ``end``, ``types``, and ``stream``. 
406 428 """ 429 + from think.utils import iter_segments, segment_parse 407 430 408 - day_dir = str(day_path(day)) 409 - # day_path now ensures dir exists 410 - if not os.path.isdir(day_dir): 411 - return [], [] 431 + day_dir = day_path(day, create=False) 432 + if not day_dir.is_dir(): 433 + return [], [], [] 412 434 413 - date_str = _date_str(day_dir) 435 + date_str = _date_str(str(day_dir)) 436 + day_date = datetime.strptime(date_str, "%Y%m%d").date() 414 437 transcript_slots: set[datetime] = set() 415 438 percept_slots: set[datetime] = set() 416 - day_path_obj = Path(day_dir) 439 + segments: list[dict[str, Any]] = [] 417 440 418 - # Check timestamp subdirectories for content files 419 - from think.utils import iter_segments, segment_parse 441 + for stream_name, _, seg_path in iter_segments(day_dir): 442 + start_time, end_time = segment_parse(seg_path.name) 443 + 444 + types = _detect_content_types(seg_path) if start_time else [] 420 445 421 - for _stream, _seg_key, seg_path in iter_segments(day_path_obj): 422 - start_time, _ = segment_parse(seg_path.name) 423 - if start_time: 424 - # Found segment - combine with date to get datetime 425 - day_date = datetime.strptime(date_str, "%Y%m%d").date() 446 + if start_time and types: 426 447 dt = datetime.combine(day_date, start_time) 427 448 slot = dt.replace( 428 449 minute=dt.minute - (dt.minute % 15), second=0, microsecond=0 429 450 ) 430 - 431 - # Check for transcript content (legacy audio + new transcript convention) 432 - if ( 433 - (seg_path / "audio.jsonl").exists() 434 - or any(seg_path.glob("*_audio.jsonl")) 435 - or any(seg_path.glob("*_transcript.jsonl")) 436 - or any(seg_path.glob("*_transcript.md")) 437 - or (seg_path / "imported.md").exists() 438 - ): 451 + if "audio" in types: 439 452 transcript_slots.add(slot) 440 - 441 - # Check for screen content 442 - if (seg_path / "screen.jsonl").exists() or any( 443 - seg_path.glob("*_screen.jsonl") 444 - ): 453 + if "screen" in types: 445 454 
percept_slots.add(slot) 446 455 447 - transcript_ranges = _slots_to_ranges(sorted(transcript_slots)) 448 - percept_ranges = _slots_to_ranges(sorted(percept_slots)) 449 - return transcript_ranges, percept_ranges 456 + if start_time and end_time and types: 457 + segments.append( 458 + { 459 + "key": seg_path.name, 460 + "start": start_time.strftime("%H:%M"), 461 + "end": end_time.strftime("%H:%M"), 462 + "types": types, 463 + "stream": stream_name, 464 + } 465 + ) 466 + 467 + audio_ranges = _slots_to_ranges(sorted(transcript_slots)) 468 + screen_ranges = _slots_to_ranges(sorted(percept_slots)) 469 + segments.sort(key=lambda s: s["start"]) 470 + return audio_ranges, screen_ranges, segments 471 + 472 + 473 + def cluster_scan(day: str) -> tuple[list[tuple[str, str]], list[tuple[str, str]]]: 474 + """Return 15-minute ranges with transcript and screen content for ``day``. 475 + 476 + Args: 477 + day: Day folder in ``YYYYMMDD`` format. 478 + 479 + Returns: 480 + Two lists containing ``(start, end)`` pairs (``HH:MM``) for transcript and 481 + screen content respectively. 
482 + """ 483 + 484 + audio_ranges, screen_ranges, _ = scan_day(day) 485 + return audio_ranges, screen_ranges 450 486 451 487 452 488 def cluster_segments(day: str) -> list[dict[str, Any]]: ··· 465 501 - end: end time as HH:MM 466 502 - types: list of content types present ("audio", "screen", or both) 467 503 """ 468 - from think.utils import segment_parse 469 - 470 - day_dir = str(day_path(day)) 471 - if not os.path.isdir(day_dir): 472 - return [] 473 - 474 - from think.utils import iter_segments 475 - 476 - day_path_obj = Path(day_dir) 477 - segments: list[dict[str, Any]] = [] 478 - 479 - for stream_name, seg_key, seg_path in iter_segments(day_path_obj): 480 - start_time, end_time = segment_parse(seg_path.name) 481 - if not (start_time and end_time): 482 - continue 483 - 484 - types = [] 485 - # Check for transcript content (legacy audio + new transcript convention) 486 - if ( 487 - (seg_path / "audio.jsonl").exists() 488 - or any(seg_path.glob("*_audio.jsonl")) 489 - or any(seg_path.glob("*_transcript.jsonl")) 490 - or any(seg_path.glob("*_transcript.md")) 491 - or (seg_path / "imported.md").exists() 492 - ): 493 - types.append("audio") 494 - 495 - # Check for screen content 496 - if (seg_path / "screen.jsonl").exists() or any(seg_path.glob("*_screen.jsonl")): 497 - types.append("screen") 498 - 499 - if not types: 500 - continue 501 - 502 - start_str = start_time.strftime("%H:%M") 503 - end_str = end_time.strftime("%H:%M") 504 - 505 - segments.append( 506 - { 507 - "key": seg_path.name, 508 - "start": start_str, 509 - "end": end_str, 510 - "types": types, 511 - "stream": stream_name, 512 - } 513 - ) 514 - 515 - # Sort by start time 516 - segments.sort(key=lambda s: s["start"]) 504 + _, _, segments = scan_day(day) 517 505 return segments 518 506 519 507

+6 -3

think/utils.py

··· 118 118 return journal 119 119 120 120 121 - def day_path(day: Optional[str] = None) -> Path: 121 + def day_path(day: Optional[str] = None, *, create: bool = True) -> Path: 122 122 """Return absolute path for a day directory within the journal. 123 123 124 124 Parameters 125 125 ---------- 126 126 day : str, optional 127 127 Day in YYYYMMDD format. If None, uses today's date. 128 + create : bool, optional 129 + Create the day directory if it does not exist. Defaults to True. 128 130 129 131 Returns 130 132 ------- ··· 145 147 raise ValueError("day must be in YYYYMMDD format") 146 148 147 149 path = Path(journal) / day 148 - path.mkdir(parents=True, exist_ok=True) 150 + if create: 151 + path.mkdir(parents=True, exist_ok=True) 149 152 return path 150 153 151 154 ··· 268 271 if isinstance(day, Path): 269 272 day_dir = day 270 273 else: 271 - day_dir = day_path(day) 274 + day_dir = day_path(day, create=False) 272 275 273 276 if not day_dir.exists(): 274 277 return []