feat: extract and store track duration for teal.fm scrobbles (#469)

- add mutagen dependency for audio metadata extraction
- create backend/utilities/audio.py with extract_duration()
- extract duration during upload, store in track.extra["duration"]
- add backfill script for tracks uploaded before this feature

duration is now correctly passed to teal.fm scrobble records.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude <noreply@anthropic.com>

authored by zzstoatzz.io Claude and committed by GitHub 88fb5e46 e9e1e592

Changed files
+310 -3
backend
src
backend
api
utilities
scripts
+1
backend/pyproject.toml
··· 26 26 "aioboto3>=15.5.0", 27 27 "slowapi @ git+https://github.com/zzstoatzz/slowapi.git@fix-deprecation", 28 28 "orjson>=3.11.4", 29 + "mutagen>=1.47.0", 29 30 ] 30 31 31 32 requires-python = ">=3.11"
+2 -2
backend/src/backend/api/tracks/mutations.py
··· 293 293 audio_url=audio_url, 294 294 file_type=track.file_type, 295 295 album=track.album, 296 - duration=None, 296 + duration=track.duration, 297 297 features=track.features if track.features else None, 298 298 image_url=image_url_override or await track.get_image_url(), 299 299 ) ··· 527 527 audio_url=track.r2_url, 528 528 file_type=track.file_type, 529 529 album=track.album, 530 - duration=None, 530 + duration=track.duration, 531 531 features=track.features if track.features else None, 532 532 image_url=await track.get_image_url(), 533 533 )
+10 -1
backend/src/backend/api/tracks/uploads.py
··· 36 36 from backend.models import Artist, Tag, Track, TrackTag 37 37 from backend.models.job import JobStatus, JobType 38 38 from backend.storage import storage 39 + from backend.utilities.audio import extract_duration 39 40 from backend.utilities.database import db_session 40 41 from backend.utilities.hashing import CHUNK_SIZE 41 42 from backend.utilities.progress import R2ProgressTracker ··· 119 120 error=f"unsupported file type: {ext}", 120 121 ) 121 122 return 123 + 124 + # extract duration from audio file 125 + with open(file_path, "rb") as f: 126 + duration = extract_duration(f) 127 + if duration: 128 + logfire.info("extracted duration", duration=duration) 122 129 123 130 # save audio file 124 131 await job_service.update_progress( ··· 322 329 audio_url=r2_url, 323 330 file_type=ext[1:], 324 331 album=album, 325 - duration=None, 332 + duration=duration, 326 333 features=featured_artists if featured_artists else None, 327 334 image_url=image_url, 328 335 ) ··· 345 352 phase="database", 346 353 ) 347 354 extra = {} 355 + if duration: 356 + extra["duration"] = duration 348 357 album_record = None 349 358 if album: 350 359 extra["album"] = album
+43
backend/src/backend/utilities/audio.py
"""audio file utilities."""

import io
import logging
from typing import BinaryIO

from mutagen import File as MutagenFile

logger = logging.getLogger(__name__)


def extract_duration(audio_data: bytes | BinaryIO) -> int | None:
    """extract duration from audio file data.

    best-effort: any parsing problem is logged at warning level and
    reported as None rather than raised to the caller.

    args:
        audio_data: raw audio bytes or file-like object

    returns:
        duration in whole seconds (truncated), or None if extraction fails
    """
    try:
        # mutagen wants a file-like object; wrap raw bytes on the fly
        stream = io.BytesIO(audio_data) if isinstance(audio_data, bytes) else audio_data

        parsed = MutagenFile(stream)
        if parsed is None:
            # format sniffing failed — not an audio container mutagen knows
            logger.warning("mutagen could not identify audio format")
            return None

        if parsed.info is None:
            logger.warning("audio file has no info")
            return None

        # not every stream-info object carries a length attribute
        seconds = getattr(parsed.info, "length", None)
        if seconds is None:
            logger.warning("audio file has no length info")
            return None

        return int(seconds)

    except Exception as e:
        # deliberate catch-all: duration is optional metadata, never fatal
        logger.warning(f"failed to extract duration: {e}")
        return None
+11
backend/uv.lock
··· 314 314 { name = "greenlet" }, 315 315 { name = "httpx" }, 316 316 { name = "logfire", extra = ["fastapi", "sqlalchemy"] }, 317 + { name = "mutagen" }, 317 318 { name = "orjson" }, 318 319 { name = "passlib", extra = ["bcrypt"] }, 319 320 { name = "psycopg", extra = ["binary"] }, ··· 357 358 { name = "greenlet", specifier = ">=3.2.4" }, 358 359 { name = "httpx", specifier = ">=0.28.0" }, 359 360 { name = "logfire", extras = ["fastapi", "sqlalchemy"], specifier = ">=4.14.2" }, 361 + { name = "mutagen", specifier = ">=1.47.0" }, 360 362 { name = "orjson", specifier = ">=3.11.4" }, 361 363 { name = "passlib", extras = ["bcrypt"], specifier = ">=1.7.4" }, 362 364 { name = "psycopg", extras = ["binary"], specifier = ">=3.2.12" }, ··· 1565 1567 { url = "https://files.pythonhosted.org/packages/ba/8f/0a60e501584145588be1af5cc829265701ba3c35a64aec8e07cbb71d39bb/multidict-6.7.0-cp314-cp314t-win_amd64.whl", hash = "sha256:09929cab6fcb68122776d575e03c6cc64ee0b8fca48d17e135474b042ce515cd", size = 53507, upload-time = "2025-10-06T14:51:53.672Z" }, 1566 1568 { url = "https://files.pythonhosted.org/packages/7f/ae/3148b988a9c6239903e786eac19c889fab607c31d6efa7fb2147e5680f23/multidict-6.7.0-cp314-cp314t-win_arm64.whl", hash = "sha256:cc41db090ed742f32bd2d2c721861725e6109681eddf835d0a82bd3a5c382827", size = 44804, upload-time = "2025-10-06T14:51:55.415Z" }, 1567 1569 { url = "https://files.pythonhosted.org/packages/b7/da/7d22601b625e241d4f23ef1ebff8acfc60da633c9e7e7922e24d10f592b3/multidict-6.7.0-py3-none-any.whl", hash = "sha256:394fc5c42a333c9ffc3e421a4c85e08580d990e08b99f6bf35b4132114c5dcb3", size = 12317, upload-time = "2025-10-06T14:52:29.272Z" }, 1570 + ] 1571 + 1572 + [[package]] 1573 + name = "mutagen" 1574 + version = "1.47.0" 1575 + source = { registry = "https://pypi.org/simple" } 1576 + sdist = { url = "https://files.pythonhosted.org/packages/81/e6/64bc71b74eef4b68e61eb921dcf72dabd9e4ec4af1e11891bbd312ccbb77/mutagen-1.47.0.tar.gz", hash = 
"sha256:719fadef0a978c31b4cf3c956261b3c58b6948b32023078a2117b1de09f0fc99", size = 1274186, upload-time = "2023-09-03T16:33:33.411Z" } 1577 + wheels = [ 1578 + { url = "https://files.pythonhosted.org/packages/b0/7a/620f945b96be1f6ee357d211d5bf74ab1b7fe72a9f1525aafbfe3aee6875/mutagen-1.47.0-py3-none-any.whl", hash = "sha256:edd96f50c5907a9539d8e5bba7245f62c9f520aef333d13392a79a4f70aca719", size = 194391, upload-time = "2023-09-03T16:33:29.955Z" }, 1568 1579 ] 1569 1580 1570 1581 [[package]]
+243
scripts/backfill_duration.py
#!/usr/bin/env -S uv run --script --quiet
"""backfill duration for tracks missing it.

## Context

Tracks uploaded before duration extraction was implemented have NULL duration.
This affects teal.fm scrobbles which should include duration metadata.

## What This Script Does

1. Finds all tracks with NULL duration in extra
2. Downloads audio files from R2 concurrently (semaphore-limited)
3. Extracts duration using mutagen
4. Updates database with extracted durations

## Usage

```bash
# dry run (show what would be updated)
uv run scripts/backfill_duration.py --dry-run

# actually update the database
uv run scripts/backfill_duration.py

# limit concurrency (default: 10)
uv run scripts/backfill_duration.py --concurrency 5

# target specific environment
DATABASE_URL=postgresql://... uv run scripts/backfill_duration.py
```

Run in order: dev → staging → prod
"""

import argparse
import asyncio
import io
import logging
import sys
from pathlib import Path

import httpx
from mutagen import File as MutagenFile

# add src to path so we can import backend modules
sys.path.insert(0, str(Path(__file__).parent.parent / "backend" / "src"))

from sqlalchemy import select, update

from backend.models import Track
from backend.utilities.database import db_session

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def extract_duration_from_bytes(audio_data: bytes) -> int | None:
    """extract duration from audio bytes.

    returns whole seconds, or None if mutagen cannot parse the data
    (or reports a zero/missing length). never raises.
    """
    try:
        audio = MutagenFile(io.BytesIO(audio_data))
        if audio is None or audio.info is None:
            return None
        length = getattr(audio.info, "length", None)
        return int(length) if length else None
    except Exception as e:
        logger.warning(f"mutagen error: {e}")
        return None


async def fetch_and_extract_simple(
    client: httpx.AsyncClient,
    track_id: int,
    title: str,
    r2_url: str | None,
    semaphore: asyncio.Semaphore,
) -> tuple[int, int | None, str | None]:
    """fetch audio header from R2 and extract duration.

    uses Range request to fetch only first 256KB (enough for metadata).
    falls back to full download if range request fails or duration not found.

    returns: (track_id, duration, error)
    """
    async with semaphore:
        if not r2_url:
            return (track_id, None, "no r2_url")

        try:
            logger.info(f"fetching track {track_id}: {title[:40]}...")

            # try range request first (256KB should be enough for most formats)
            headers = {"Range": "bytes=0-262143"}
            response = await client.get(r2_url, headers=headers, follow_redirects=True)

            # 206 = partial content (range worked), 200 = full file (range ignored)
            if response.status_code not in (200, 206):
                response.raise_for_status()

            duration = extract_duration_from_bytes(response.content)
            if duration:
                logger.info(f" → {duration}s")
                return (track_id, duration, None)

            # if range didn't give us duration, try full file
            if response.status_code == 206:
                logger.info(" range request didn't work, fetching full file...")
                response = await client.get(r2_url, follow_redirects=True)
                response.raise_for_status()
                duration = extract_duration_from_bytes(response.content)
                if duration:
                    logger.info(f" → {duration}s")
                    return (track_id, duration, None)

            return (track_id, None, "could not extract duration")

        except httpx.HTTPStatusError as e:
            return (track_id, None, f"HTTP {e.response.status_code}")
        except Exception as e:
            return (track_id, None, str(e))


async def backfill_duration(dry_run: bool = False, concurrency: int = 10) -> None:
    """backfill duration for tracks missing it.

    three phases, each holding the DB connection only as long as needed:
    query candidates, download/extract with no connection open, then
    commit updates on a fresh connection.
    """

    # phase 1: query tracks needing backfill, then close connection
    track_data: list[tuple[int, str, str | None, dict | None]] = []
    async with db_session() as db:
        # missing key OR explicit JSON null both count as "no duration"
        stmt = select(Track).where(
            Track.extra["duration"].astext.is_(None) | ~Track.extra.has_key("duration")
        )
        result = await db.execute(stmt)
        tracks = list(result.scalars().all())

        if not tracks:
            logger.info("no tracks need duration backfill")
            return

        logger.info(f"found {len(tracks)} tracks needing duration backfill")

        if dry_run:
            logger.info("dry run mode - tracks that would be updated:")
            for track in tracks:
                logger.info(f"  {track.id}: {track.title} ({track.r2_url})")
            return

        # extract plain data before closing session
        track_data = [(t.id, t.title, t.r2_url, t.extra) for t in tracks]

    # phase 2: download files and extract durations (no DB connection)
    semaphore = asyncio.Semaphore(concurrency)
    logger.info(
        f"processing {len(track_data)} tracks with concurrency={concurrency}..."
    )

    async with httpx.AsyncClient(timeout=120.0) as client:
        results = await asyncio.gather(
            *[
                fetch_and_extract_simple(client, tid, title, r2_url, semaphore)
                for tid, title, r2_url, _ in track_data
            ]
        )

    # build update map (failures are logged, never abort the batch)
    updates: list[tuple[int, dict]] = []
    failed = 0
    track_extras = {tid: extra or {} for tid, _, _, extra in track_data}
    track_titles = {tid: title for tid, title, _, _ in track_data}

    for track_id, duration, error in results:
        if duration:
            # merge rather than replace so other extra keys survive
            new_extra = {**track_extras[track_id], "duration": duration}
            updates.append((track_id, new_extra))
        else:
            failed += 1
            logger.warning(
                f"failed track {track_id} ({track_titles[track_id]}): {error}"
            )

    if not updates:
        logger.info("no updates to commit")
        return

    # phase 3: fresh connection to commit updates
    logger.info(f"committing {len(updates)} updates...")
    async with db_session() as db:
        for track_id, new_extra in updates:
            stmt = update(Track).where(Track.id == track_id).values(extra=new_extra)
            await db.execute(stmt)
        await db.commit()

    logger.info(f"backfill complete: {len(updates)} updated, {failed} failed")


async def main() -> None:
    """parse CLI flags and run the backfill."""
    # argparse replaces the previous hand-rolled sys.argv scan: same flags
    # and defaults, but unknown/malformed flags now error instead of being
    # silently ignored, and --help works.
    parser = argparse.ArgumentParser(
        description="backfill duration for tracks missing it",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="show what would be updated without writing to the database",
    )
    parser.add_argument(
        "--concurrency",
        type=int,
        default=10,
        help="max concurrent R2 downloads (default: 10)",
    )
    args = parser.parse_args()

    if args.dry_run:
        logger.info("DRY RUN mode - no changes will be made")

    await backfill_duration(dry_run=args.dry_run, concurrency=args.concurrency)


if __name__ == "__main__":
    asyncio.run(main())