# music on atproto — plyr.fm
1#!/usr/bin/env -S uv run --script --quiet
2"""backfill image_url for tracks with image_id but missing image_url.
3
4## Context
5
6PR #184 added image_url column to tracks table to eliminate N+1 R2 API calls.
7New uploads automatically populate image_url at creation time, but 15 legacy
8tracks uploaded before the PR still have image_url = NULL.
9
10This causes slow /tracks/liked endpoint performance because we fall back to
11calling track.get_image_url() which hits R2 API on every request.
12
13## What This Script Does
14
151. Finds all tracks with image_id but NULL image_url
162. Computes image_url by calling storage.get_url(image_id)
173. Updates database with computed URLs
184. Runs concurrently for performance
19
20## Usage
21
22```bash
23# dry run (show what would be updated)
24uv run scripts/backfill_image_urls.py --dry-run
25
26# actually update the database
27uv run scripts/backfill_image_urls.py
28
29# target specific environment
30DATABASE_URL=postgresql://... uv run scripts/backfill_image_urls.py
31```
32"""
33
34import asyncio
35import logging
36import sys
37from pathlib import Path
38
39# add src to path so we can import backend modules
40sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
41
42from sqlalchemy import select
43
44from backend.config import settings
45from backend.models import Track
46from backend.storage import storage
47from backend.storage.r2 import R2Storage
48from backend.utilities.database import db_session
49
# configure root logging once at script startup so every message is
# timestamped and leveled; INFO is the right default for a one-off
# operational script (progress is the point of running it).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# module-level logger, conventional per-module handle (PEP 282 style)
logger = logging.getLogger(__name__)
55
56
async def backfill_image_urls(dry_run: bool = False) -> None:
    """backfill image_url for tracks with image_id but missing image_url.

    Selects all tracks whose ``image_id`` is set but whose ``image_url``
    is NULL, computes each URL concurrently via the R2 storage backend,
    then writes the results back in a single commit.

    Args:
        dry_run: when True, only log the tracks that would be updated;
            no URLs are computed and nothing is written.
    """
    # only R2 knows how to turn an image_id into a public URL
    if not isinstance(storage, R2Storage):
        logger.error("storage backend is not R2, cannot compute image URLs")
        return

    logger.info("storage backend: %s", settings.storage.backend)

    async with db_session() as db:
        # find tracks with image_id but no image_url
        stmt = select(Track).where(
            Track.image_id.isnot(None), Track.image_url.is_(None)
        )
        result = await db.execute(stmt)
        tracks = result.scalars().all()

        if not tracks:
            logger.info("no tracks need backfilling")
            return

        logger.info("found %d tracks needing image_url backfill", len(tracks))

        if dry_run:
            logger.info("dry run mode - showing tracks that would be updated:")
            for track in tracks:
                logger.info(
                    "  track %s: %s (image_id: %s)",
                    track.id,
                    track.title,
                    track.image_id,
                )
            return

        # compute image URLs concurrently (fan out one R2 call per track)
        logger.info("computing image URLs from R2...")

        async def compute_url(
            track: Track,
        ) -> tuple[int, str | None, Exception | None]:
            """compute image_url for a track and return (track_id, url, error)."""
            try:
                url = await storage.get_url(track.image_id, file_type="image")
            except Exception as e:  # collect per-track failures, never abort the batch
                return (track.id, None, e)
            return (track.id, url, None)

        results = await asyncio.gather(*[compute_url(t) for t in tracks])

        # build the id -> track map once: O(1) per result instead of the
        # O(n) linear scan the per-result next(...) lookup would cost
        tracks_by_id = {t.id: t for t in tracks}

        # update database with computed URLs
        updated = 0
        failed = 0

        for track_id, image_url, error in results:
            track = tracks_by_id[track_id]
            if image_url:
                track.image_url = image_url
                updated += 1
                logger.info("updated track %s: %s", track_id, track.title)
            else:
                failed += 1
                logger.error(
                    "failed to compute URL for track %s (%s): %s",
                    track_id,
                    track.title,
                    error,
                )

        # single commit so the backfill is all-or-nothing per run
        await db.commit()

        logger.info("backfill complete: %d updated, %d failed", updated, failed)
124
125
async def main() -> None:
    """parse CLI flags and dispatch the backfill."""
    # crude but sufficient flag parsing for a single-option script
    wants_dry_run = "--dry-run" in sys.argv

    if wants_dry_run:
        logger.info("running in DRY RUN mode - no changes will be made")

    await backfill_image_urls(dry_run=wants_dry_run)
134
135
# script entry point: spin up a fresh event loop and run the async workflow
if __name__ == "__main__":
    asyncio.run(main())